// Parser.h

#pragma once

#include <fstream>
#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include "../util/Tokenizer.h"
#include "../util/stringProcessing.h"
#include "../shared/Document.h"
#include "../shared/ProducerConsumerQueue.h"
#include "../crawler/Readers/StreamReader.h"
#include "../crawler/UrlFrontier.h"
using namespace std;

/**
 * Parses the text of a crawled document.
 *
 * A Parser is constructed around a UrlFrontier and reads a document through a
 * StreamReader. execute() returns a pointer to a dictionary that contains the
 * tokenized input (string key -> list of unsigned long values; presumably
 * token -> offsets, confirm against Tokenizer).
 *
 * Standard-library types are spelled with an explicit std:: prefix so that the
 * declarations do not depend on a using-directive being in effect.
 */
class Parser
	{

public:

	/**
	 * Parser constructor.
	 *
	 * Declared explicit so that a UrlFrontier pointer is never silently
	 * converted into a Parser.
	 *
	 * @param urlFrontierIn frontier associated with this parser; see
	 *                      pushToUrlQueue. NOTE(review): ownership and
	 *                      lifetime are not visible from this header -
	 *                      presumably non-owning, confirm in Parser.cpp.
	 */
	explicit Parser ( UrlFrontier* urlFrontierIn );

	/**
	 * Executes the Parser on one document.
	 *
	 * @param reader source of the document to parse
	 * @return pointer to the dictionary that contains the tokenized input.
	 *         NOTE(review): who owns / frees the returned dictionary is not
	 *         visible from this header - confirm against Tokenizer.
	 */
	const std::unordered_map< std::string, std::vector< unsigned long > > *execute ( StreamReader *reader );


private:
	// Frontier handed to the constructor. Defaults to nullptr so the member
	// never holds an indeterminate value.
	UrlFrontier *urlFrontier = nullptr;

	/**
	 * Parses the document supplied by reader.
	 *
	 * @param reader    source of the document to parse
	 * @param tokenizer tokenizer used while parsing (presumably receives the
	 *                  extracted text - confirm in Parser.cpp)
	 */
	void parse ( StreamReader *reader, Tokenizer *tokenizer );

	/**
	 * Returns anchor text if found.
	 *
	 * @param html html to search
	 * @return the anchor text. NOTE(review): the value returned when nothing
	 *         is found is not visible here - presumably "", confirm.
	 */
	std::string extractAnchorText ( std::string html );

	/**
	 * Returns true if the tag has no closing tag and should be ignored.
	 *
	 * @param html   html being examined
	 * @param htmlIt position within html (presumably the index of the tag
	 *               under inspection - confirm in Parser.cpp)
	 * @return true if the tag should be ignored
	 */
	bool isInvalidTag( std::string html, unsigned long htmlIt );

	/**
	 * Returns a url, or "" if none.
	 *
	 * @param html html to search
	 * @return the url, or "" if none
	 */
	std::string extractUrl ( std::string html );

	/**
	 * Returns a title, or "" if none.
	 *
	 * @param html html to search
	 * @return the title, or "" if none
	 */
	std::string extractTitle ( std::string html );

	/**
	 * Local-url handling for url relative to currentUrl.
	 *
	 * NOTE(review): the original comment said "will return true if local
	 * url", but the function is declared to return a string, not a bool.
	 * Presumably it returns the resolved url - confirm in Parser.cpp.
	 *
	 * @param url        url to examine
	 * @param currentUrl url of the document currently being parsed
	 * @return a string; exact contents to be confirmed (see note above)
	 */
	std::string isLocal ( std::string url, ParsedUrl* currentUrl );

	/**
	 * Returns true if url is valid.
	 *
	 * @param url url to check
	 * @return true if url is valid
	 */
	bool isValid ( std::string url );

	/**
	 * Sends a url to the Url Frontier.
	 *
	 * @param url        url to send
	 * @param currentUrl url of the document currently being parsed
	 * @param anchorText anchor text associated with url
	 * @param debug      when true, will print urls to std::cout
	 */
	void pushToUrlQueue ( std::string url, ParsedUrl * currentUrl, std::string anchorText, bool debug );

	/**
	 * Returns true if html is the given tag.
	 *
	 * @param html html to test
	 * @param tag  tag to look for
	 * @return true if given tag
	 */
	bool isTag ( std::string html, std::string tag );

	/**
	 * Extracts the paragraph text.
	 *
	 * offsetTitle and offsetBody are taken by non-const reference, so the
	 * caller's values may be modified (presumably advanced as text is
	 * tokenized - confirm in Parser.cpp).
	 *
	 * @param html        html to extract from
	 * @param offsetTitle title offset, passed by reference
	 * @param offsetBody  body offset, passed by reference
	 * @param isParagraph whether html is paragraph text
	 * @param tokenizer   tokenizer used for the extracted text
	 * @param currentUrl  url of the document currently being parsed
	 * @return the extracted text
	 */
	std::string extractBody ( std::string html, unsigned long & offsetTitle, unsigned long & offsetBody,
	                          bool isParagraph,
	                          Tokenizer *tokenizer,
	                          ParsedUrl * currentUrl );


	/**
	 * Extracts all text in html.
	 *
	 * offsetTitle and offsetBody are taken by non-const reference, so the
	 * caller's values may be modified (presumably advanced as text is
	 * tokenized - confirm in Parser.cpp).
	 *
	 * @param line        html to extract from
	 * @param offsetTitle title offset, passed by reference
	 * @param offsetBody  body offset, passed by reference
	 * @param isParagraph whether line is paragraph text
	 * @param tokenizer   tokenizer used for the extracted text
	 * @param currentUrl  url of the document currently being parsed
	 */
	void extractAll ( std::string line, unsigned long & offsetTitle, unsigned long & offsetBody,
	                  bool isParagraph,
	                  Tokenizer *tokenizer,
	                  ParsedUrl * currentUrl );

	//TODO delete?? may not need
	/**
	 * Removes given html tags.
	 *
	 * html and htmlIt are taken by non-const reference, so both may be
	 * modified by the call.
	 *
	 * @param html         html to modify
	 * @param htmlIt       position within html, passed by reference
	 * @param savePosition saved position within html (exact role to be
	 *                     confirmed in Parser.cpp)
	 * @param tag          tag to remove
	 */
	void removeTag ( std::string & html, unsigned long & htmlIt, unsigned long savePosition, std::string tag );

	/**
	 * Extracts the header tags and adds to body.
	 *
	 * @param html html to extract from
	 * @return the extracted header text (presumably; confirm in Parser.cpp)
	 */
	std::string extractHeader ( std::string html );

	};