Skip to content
Snippets Groups Projects
Parser.h 2.83 KiB
Newer Older
  • Learn to ignore specific revisions
  • vcday's avatar
    vcday committed
    //
    // Created by Veronica Day on 1/28/18.
    //
    
    // keep running count of offset; if stop word: don't increment and remove stopword
    // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
    //
    
    
    #include <string>
    #include <functional>
    #include <queue>
    #include <iostream>
    #include <fstream>
    
    vcday's avatar
    vcday committed
    #include "../util/Tokenizer.h"
    
    vcday's avatar
    vcday committed
    #include "../util/stringProcessing.h"
    
    vcday's avatar
    vcday committed
    #include "../shared/Document.h"
    #include "../shared/ProducerConsumerQueue.h"
    
    vcday's avatar
    vcday committed
    
    using namespace std;
    
    
    vcday's avatar
    vcday committed
    /**
     * This class uses the Doc object from the Crawler to parse the text
     * Returns a pointer to a dictionary that contains the tokenized input
     */
    
    vcday's avatar
    vcday committed
    class Parser
    
    vcday's avatar
    vcday committed
    	{
    
    vcday's avatar
    vcday committed
    
    public:
    
    
    vcday's avatar
    vcday committed
    	Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		urlFrontier = urlFrontierIn;
    		}
    
    vcday's avatar
    vcday committed
    
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parser
    	 * @return
    	 */
    
    vcday's avatar
    vcday committed
    	const unordered_map< string, vector< int>> * execute ( Document* document)
    
    vcday's avatar
    vcday committed
    		{
    		Tokenizer tokenizer;
    
    vcday's avatar
    vcday committed
    		parse ( document->DocToString (), &tokenizer );
    
    vcday's avatar
    vcday committed
    		return tokenizer.get ( );
    		}
    
    vcday's avatar
    vcday committed
    
    
    private:
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue < string >* urlFrontier;
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parses file
    	 * @param inFile
    	 * @return
    	 */
    
    vcday's avatar
    vcday committed
    	//TODO instead of grabbing each line, look to see if beginning of
    	// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
    
    vcday's avatar
    vcday committed
    	void parse ( string html, Tokenizer *tokenizer )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		auto htmlIt = html.begin();
    		int offset = 0;
    		while (htmlIt != html.end())
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			// if open bracket
    			if ( *htmlIt == '<' )
    
    vcday's avatar
    vcday committed
    				{
    
    vcday's avatar
    vcday committed
    				auto begCloseTag = findNext ("</", htmlIt);
    				auto endCloseTag = findNext ( ">", begCloseTag);
    				string line (htmlIt, endCloseTag + 1);
    				htmlIt = endCloseTag + 2;
    
    				// check if line is url
    				string url = extract_url ( line );
    				if (url != "")
    					{
    					urlFrontier->Push ( url );
    					}
    				// check if line is title
    				else
    					{
    					string title = extract_title ( line );
    					if (title != "")
    						{
    						tokenizer->execute ( title, offset );
    						}
    					}
    				//TODO fix offset?
    				offset = htmlIt - html.begin();
    
    vcday's avatar
    vcday committed
    				}
    			else
    				{
    
    vcday's avatar
    vcday committed
    				++htmlIt;
    
    vcday's avatar
    vcday committed
    				}
    			}
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    		}
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Returns a url, or "" if none
    	 * @param word
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    vcday's avatar
    vcday committed
    	string extract_url ( string word )
    
    vcday's avatar
    vcday committed
    		{
    		string url = "";
    
    vcday's avatar
    vcday committed
    		if ( *findStr ( "<a", word ) != '\0' )
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			auto foundHref = findStr ( "href", word );
    			auto foundHttp = findNext ( "http", foundHref );
    
    vcday's avatar
    vcday committed
    			if ( *foundHttp != '\0' )
    
    vcday's avatar
    vcday committed
    				{
    
    vcday's avatar
    vcday committed
    				url = "";
    				auto closeTag = findNext ( ">", word.begin ( ) );
    				while ( *foundHttp != *closeTag )
    
    vcday's avatar
    vcday committed
    					{
    
    vcday's avatar
    vcday committed
    					url += *foundHttp;
    					++foundHttp;
    
    vcday's avatar
    vcday committed
    					}
    				}
    			}
    
    
    vcday's avatar
    vcday committed
    		return url;
    
    vcday's avatar
    vcday committed
    		}
    
    	/**
    
    vcday's avatar
    vcday committed
    	 * Returns a title, or "" if none
    
    vcday's avatar
    vcday committed
    	 * @param word
    
    vcday's avatar
    vcday committed
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    vcday's avatar
    vcday committed
    	string extract_title ( string & word )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		string title = "";
    
    vcday's avatar
    vcday committed
    		char end = '<';
    
    vcday's avatar
    vcday committed
    		auto pos = findStr ( "<title>", word );
    
    vcday's avatar
    vcday committed
    		if ( *pos != '\0')
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			pos += 7;
    			while ( *pos != end )
    
    vcday's avatar
    vcday committed
    				{
    				title += *pos;
    
    vcday's avatar
    vcday committed
    				++pos;
    
    vcday's avatar
    vcday committed
    				}
    			}
    
    vcday's avatar
    vcday committed
    		return title;
    
    vcday's avatar
    vcday committed
    		}
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    	};
    
    vcday's avatar
    vcday committed