Skip to content
Snippets Groups Projects
Parser.h 2.41 KiB
Newer Older
  • Learn to ignore specific revisions
  • vcday's avatar
    vcday committed
    //
    // Created by Veronica Day on 1/28/18.
    //
    
    // keep running count of offset, if stop word: don't increment and remove stopword
    // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
    //
    
    
    #include <string>
    #include <functional>
    #include <queue>
    #include <iostream>
    #include <fstream>
    
    vcday's avatar
    vcday committed
    #include "../util/Tokenizer.h"
    
    vcday's avatar
    vcday committed
    #include "../util/stringProcessing.h"
    
    vcday's avatar
    vcday committed
    #include "../shared/Document.h"
    #include "../shared/ProducerConsumerQueue.h"
    
    vcday's avatar
    vcday committed
    
    using namespace std;
    
    
    vcday's avatar
    vcday committed
    /**
     * This class uses the Doc object from the Crawler to parse the text
     * Returns a pointer to a dictionary that contains the tokenized input
     */
    
    vcday's avatar
    vcday committed
    class Parser
    
    vcday's avatar
    vcday committed
    	{
    
    vcday's avatar
    vcday committed
    
    public:
    
    
    vcday's avatar
    vcday committed
    	Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		urlFrontier = urlFrontierIn;
    		}
    
    vcday's avatar
    vcday committed
    
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parser
    	 * @return
    	 */
    
    vcday's avatar
    vcday committed
    	const unordered_map< string, vector< int>> * execute ( Document* document)
    
    vcday's avatar
    vcday committed
    		{
    		Tokenizer tokenizer;
    
    vcday's avatar
    vcday committed
    		parse ( document->DocToString (), &tokenizer );
    
    vcday's avatar
    vcday committed
    		return tokenizer.get ( );
    		}
    
    vcday's avatar
    vcday committed
    
    
    private:
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue < string >* urlFrontier;
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parses file
    	 * @param inFile
    	 * @return
    	 */
    
    vcday's avatar
    vcday committed
    	void parse ( string html, Tokenizer *tokenizer )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    		string tokenizerInput = "";
    		string currentTerm = "";
    
    vcday's avatar
    vcday committed
    		int index = 0;
    		while (index != html.size())
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			currentTerm = "";
    			while ( html.at( index ) != '\n' )
    
    vcday's avatar
    vcday committed
    				{
    
    vcday's avatar
    vcday committed
    				currentTerm += html[ index ];
    				++index;
    
    vcday's avatar
    vcday committed
    				}
    
    vcday's avatar
    vcday committed
    			++index;
    
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    			string url = extract_url ( currentTerm );
    			if (url != "")
    				{
    				urlFrontier->Push (url);
    				}
    			else
    				{
    				string title = extract_title ( currentTerm );
    				if (title != "")
    					{
    					tokenizerInput += title;
    					}
    				}
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    			}
    
    vcday's avatar
    vcday committed
    		tokenizer->execute ( tokenizerInput );
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    		}
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Returns a url, or "" if none
    	 * @param word
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    vcday's avatar
    vcday committed
    	string extract_url ( string word )
    
    vcday's avatar
    vcday committed
    		{
    		string url = "";
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    		if ( *findStr ( word, "<a" ) != '\0' )
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			auto foundHttp = findStr ( word, "href=http" );
    
    vcday's avatar
    vcday committed
    			if ( *foundHttp != '\0' )
    
    vcday's avatar
    vcday committed
    				{
    				url = "http";
    
    vcday's avatar
    vcday committed
    				foundHttp += 9;
    
    
    vcday's avatar
    vcday committed
    				while ( *foundHttp != *findStr ( word, "\">" ) )
    
    vcday's avatar
    vcday committed
    					{
    
    vcday's avatar
    vcday committed
    					url += *foundHttp;
    					++foundHttp;
    
    vcday's avatar
    vcday committed
    					}
    				}
    			}
    
    
    vcday's avatar
    vcday committed
    		return url;
    
    vcday's avatar
    vcday committed
    		}
    
    	/**
    
    vcday's avatar
    vcday committed
    	 * Returns a title, or "" if none
    
    vcday's avatar
    vcday committed
    	 * @param word
    
    vcday's avatar
    vcday committed
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    vcday's avatar
    vcday committed
    	string extract_title ( string & word )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		string title = "";
    
    vcday's avatar
    vcday committed
    		char end = '<';
    
    vcday's avatar
    vcday committed
    		auto pos = findStr ( "<title>", word );
    
    vcday's avatar
    vcday committed
    		if ( *pos != '\0')
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			pos += 7;
    			while ( *pos != end )
    
    vcday's avatar
    vcday committed
    				{
    				title += *pos;
    
    vcday's avatar
    vcday committed
    				++pos;
    
    vcday's avatar
    vcday committed
    				}
    			}
    
    vcday's avatar
    vcday committed
    		return title;
    
    vcday's avatar
    vcday committed
    		}
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    	};
    
    vcday's avatar
    vcday committed