diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 2cfbd525551fe982a06b412778ab15d2786e01e8..19a854295152b4b7abaebbfddc3cb82bf4e46f5b 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -7,12 +7,15 @@
  * @param inFile
  * @return
  */
-//TODO instead of grabbing each line, look to see if beginning of
-// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
-// TODO have to read input in as a stream of chars eventually - cat into string?
 // TODO different counts: frequency, total num unique words, etc
-// TODO handle bad html style (ie no closing p tag)
 //TODO flag different types of words - determine if we want to do this in key of dict or value (in wordData struct)
+/*
+ * Anchor text = #
+ * Title = *
+ * Url = @
+ * Body = %
+ */
 void Parser::parse ( string html, Tokenizer *tokenizer )
     {
@@ -63,17 +66,13 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
         ++htmlIt;
         }
     }
-
-
 }
-/**
- * Returns a url, or "" if none
- * @param word
- * @return
- */
-bool Parser::isScript ( string word )
+/*
+ * Returns true if script tag, false if not
+ */
+bool Parser::isScript ( string & word )
     {
     if ( *findStr ( "<script", word ) != '\0' )
         {
@@ -81,8 +80,11 @@ bool Parser::isScript ( string word )
         }
     return false;
     }
-
-string Parser::extract_body( string word )
+/*
+ * Returns body text if there are p tags, empty string if not
+ * If there's no closing tag, stops at the next opening tag or at end of file
+ */
+string Parser::extract_body( string & word, int & offset )
     {
     string body = "";
     auto foundBody = findStr("<p", word) != '\0';
@@ -91,11 +93,20 @@
     while ( *findStr != '<' )
         {
         body += *findStr;
+        if ( *findStr == ' ' )
+            {
+            offset += 1;
+            }
         }
     }
     return body;
     }
+/**
+ * Returns a url, or "" if none
+ * @param word
+ * @return
+ */
 string Parser::extract_url ( string & word )
     {
diff --git a/parser/Parser.h b/parser/Parser.h
index da60d764f0fe07690dfb8983cf4af45c65919489..c19fff2c4e47f9e1c8da325d77fa47123c5a1e55 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -32,6 +32,7 @@ public:
      * Parser
      * @return
      */
+    // TODO need to change vector type to wordData, change where struct is declared
     const unordered_map< string, vector< int>> * execute ( Document* document)
         {
         Tokenizer tokenizer;
@@ -48,8 +49,6 @@ private:
      * @param inFile
      * @return
      */
-    //TODO instead of grabbing each line, look to see if beginning of
-    // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
     void parse ( string html, Tokenizer *tokenizer );
@@ -68,6 +67,8 @@
      */
     string extract_title ( string & word );

+    bool isScript ( string & word );
+
+    string extract_body( string & word );
     };
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index 931448b99b4ad833e2d03dfdb45bffafd61a93f7..bf867b5003a6d79600517b9b4b3ddd1b6c4bc3ae 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -27,21 +27,29 @@ public:
         return docIndex;
         }
     //add type of word parameter, ie paragraph, url etc
-    void execute ( string originalText, int offset )
-        {
+    void execute ( string & originalText, int offset )
+        {
         vector< string > splitText = splitStr ( originalText, ' ' );
-        string lowerString = "";
+        string processedString = "";
         int vectorLength = 0;
-        for ( int i = 0; i < splitText.size ( ); ++i )
-            {
-            lowerString = toLower ( splitText[ i ] );
-            if ( !isStopWord ( lowerString ) )
+        for ( int i = 0; i < splitText.size( ); ++i )
+            {
+            // case fold
+            processedString = toLower( splitText[ i ] );
+            // strip punctuation and other non-word characters
+            processedString = stripStr( processedString );
+
+            if ( !isStopWord ( processedString ) )
                 {
-                wordData currentWord;
+                // stem word
+                processedString = stem.execute( processedString );
+
+                wordData currentWord;
                 currentWord.offset = offset;
-                vectorLength = ( *docIndex )[ lowerString ].size( );
-                ( *docIndex )[ lowerString ].push_back ( currentWord );
-                ( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
+                vectorLength = ( *docIndex )[ processedString ].size( );
+                ( *docIndex )[ processedString ].push_back ( currentWord );
+                // increment the frequency of the entry just pushed; it sits at
+                // index vectorLength (the pre-push size)
+                ( *docIndex )[ processedString ][ vectorLength ].frequency += 1;
                 ++offset;
                 }
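The indexing step in Tokenizer::execute is the heart of this change: normalize each token, skip stop words, stem, then append a wordData entry and bump its frequency. Below is a minimal, self-contained sketch of that bookkeeping, assuming wordData carries offset and frequency fields as the diff suggests; normalize( ) is a hypothetical stand-in for the project's toLower/stripStr/stem.execute helpers, not the real API.

#include <cctype>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Assumed shape of an index entry, based on the fields the diff touches.
struct wordData
    {
    int offset = 0;
    int frequency = 0;
    };

// Hypothetical stand-in for the toLower/stripStr pipeline: case-fold and
// drop anything that is not alphanumeric.
std::string normalize ( const std::string & raw )
    {
    std::string out;
    for ( char c : raw )
        if ( std::isalnum ( static_cast< unsigned char >( c ) ) )
            out += std::tolower ( static_cast< unsigned char >( c ) );
    return out;
    }

void indexWord ( std::unordered_map< std::string, std::vector< wordData > > & docIndex,
                 const std::string & word, int offset )
    {
    wordData currentWord;
    currentWord.offset = offset;
    std::vector< wordData > & entries = docIndex[ word ];
    entries.push_back ( currentWord );
    // The entry just pushed sits at the old size( ), i.e. entries.size( ) - 1
    // after the push_back; back( ) avoids the index arithmetic entirely.
    entries.back( ).frequency += 1;
    }

int main ( )
    {
    std::unordered_map< std::string, std::vector< wordData > > docIndex;
    std::vector< std::string > tokens = { "The", "quick", "fox", "jumps", "the", "fox" };
    int offset = 0;
    for ( const std::string & tok : tokens )
        {
        std::string w = normalize ( tok );
        if ( !w.empty( ) )
            indexWord ( docIndex, w, offset++ );
        }
    std::cout << "\"fox\" occurrences: " << docIndex[ "fox" ].size( ) << "\n"; // prints 2
    return 0;
    }

Using back( ) instead of re-indexing with the pre-push size sidesteps off-by-one mistakes, and since one wordData is pushed per occurrence, the occurrence count is simply entries.size( ); it may be worth deciding whether the per-entry frequency field is still needed.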