Skip to content
Snippets Groups Projects
Parser.h 1.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • vcday's avatar
    vcday committed
    
    
    #pragma once
    
    vcday's avatar
    vcday committed
    #include <fstream>
    #include <functional>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <unordered_map>
    #include <vector>
    
    vcday's avatar
    vcday committed
    #include "../util/Tokenizer.h"
    
    vcday's avatar
    vcday committed
    #include "../util/stringProcessing.h"
    
    vcday's avatar
    vcday committed
    #include "../shared/Document.h"
    #include "../shared/ProducerConsumerQueue.h"
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    using namespace std;
    
    
    vcday's avatar
    vcday committed
    /**
     * This class uses the Doc object from the Crawler to parse the text
     * Returns a pointer to a dictionary that contains the tokenized input
     */
    
    vcday's avatar
    vcday committed
    class Parser
    
    vcday's avatar
    vcday committed
    	{
    
    vcday's avatar
    vcday committed
    
    public:
    
    
    vcday's avatar
    vcday committed
    	Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		urlFrontier = urlFrontierIn;
    		}
    
    vcday's avatar
    vcday committed
    
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parser
    	 * @return
    	 */
    
    aanvi's avatar
    aanvi committed
    	// TODO need to change vector type to word data, change where struct is declared
    
    vcday's avatar
    vcday committed
    	const unordered_map< string, vector< int>> * execute ( Document* document)
    
    vcday's avatar
    vcday committed
    		{
    		Tokenizer tokenizer;
    
    vcday's avatar
    vcday committed
    		parse ( document->DocToString (), &tokenizer );
    
    vcday's avatar
    vcday committed
    		return tokenizer.get ( );
    		}
    
    vcday's avatar
    vcday committed
    
    
    private:
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue < string >* urlFrontier;
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    	/**
    	 * Parses file
    	 * @param inFile
    	 * @return
    	 */
    
    	void parse ( string html, Tokenizer *tokenizer );
    
    vcday's avatar
    vcday committed
    
    
    vcday's avatar
    vcday committed
    
    	/**
    	 * Returns a url, or "" if none
    	 * @param word
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    vcday's avatar
    vcday committed
    	string extract_url ( string & word );
    
    vcday's avatar
    vcday committed
    
    	/**
    
    vcday's avatar
    vcday committed
    	 * Returns a title, or "" if none
    
    vcday's avatar
    vcday committed
    	 * @param word
    
    vcday's avatar
    vcday committed
    	 * @return
    
    vcday's avatar
    vcday committed
    	 */
    
    	string extract_title ( string & word );
    
    
    aanvi's avatar
    aanvi committed
    	bool isScript ( string & word );
    
    vcday's avatar
    vcday committed
    
    
    aanvi's avatar
    aanvi committed
    	string extract_body( string & word );
    
    vcday's avatar
    vcday committed
    	};
    
    vcday's avatar
    vcday committed