From 2159787c29ae5fd1fa5ed39f0d9cdec45d72cc73 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Sat, 17 Mar 2018 13:31:16 -0400 Subject: [PATCH] fixed weird parsing errors --- crawler/spider.cpp | 3 +- parser/Parser.cpp | 38 ++++++++++++++--- parser/Parser.h | 13 ++++-- parser/tests/parserTest.cpp | 79 +++++++++++++++++++++++++++--------- shared/Document.cpp | 6 +++ shared/Document.h | 1 + shared/url.h | 2 +- util/Tokenizer.cpp | 43 ++++++++++++++++---- util/Tokenizer.h | 30 +++++++++++--- util/stringProcessing.cpp | 75 ++++++++++++++++++++++++++++++---- util/stringProcessing.h | 34 ++++++++++++---- util/tests/tokenizerTest.cpp | 2 +- 12 files changed, 265 insertions(+), 61 deletions(-) diff --git a/crawler/spider.cpp b/crawler/spider.cpp index fc9c565..d377fab 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -73,6 +73,7 @@ void Spider::FuncToRun() Document document ( currentUrl, reader->buffer ); auto dict = parser.execute ( &document ); + cout << "docID: " << docID << endl; for ( auto it = dict->begin( ); it != dict->end( ); it++ ) { @@ -85,7 +86,7 @@ void Spider::FuncToRun() } cout << std::endl; delete dict; - + dict = nullptr; cond = true; } else diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 99b46d4..334c123 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -16,10 +16,10 @@ Parser::Parser ( ProducerConsumerQueue< string > *urlFrontierIn ) * Executes the Parser * @return */ -const unordered_map< string, vector< int > > *Parser::execute ( Document *document ) +const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document ) { Tokenizer tokenizer; - parse( document->DocToString( ), &tokenizer ); + parse( document->DocToString( ), document->getUrl( ), &tokenizer ); return tokenizer.get( ); } @@ -28,12 +28,21 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume * @param inFile * @return */ -//TODO instead of grabbing each line, look to see if beginning of -// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found -void Parser::parse ( string html, Tokenizer *tokenizer ) +void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) { + auto htmlIt = html.begin( ); unsigned long offset = 0; + + // tokenize url + string host = ""; + host.assign( currentUrl.Host ); + string path = ""; + path.assign( currentUrl.Path ); + string url = host + "/" + path; + + tokenizer->execute( url, offset, Tokenizer::URL ); + while ( htmlIt != html.end( ) ) { // if open bracket @@ -48,7 +57,14 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) string url = extract_url( line ); if ( url != "" ) { + if ( isLocal ( url ) ) + { + string completeUrl = ""; + completeUrl.assign( currentUrl.CompleteUrl ); + url = completeUrl + url; + } urlFrontier->Push( url ); + cout << url << endl; } // check if line is title else @@ -56,7 +72,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) string title = extract_title( line ); if ( title != "" ) { - tokenizer->execute( title, offset ); + tokenizer->execute( title, offset, Tokenizer::TITLE ); } } offset = htmlIt - html.begin( ); @@ -123,3 +139,13 @@ string Parser::extract_title ( string & word ) return title; } +/** + * Will return true if local url + * + * @param url + * @return + */ +bool Parser::isLocal ( string url ) + { + return ( *url.begin( ) == '/' ); + } \ No newline at end of file diff --git a/parser/Parser.h b/parser/Parser.h index dc1f355..7916a9b 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -33,7 +33,7 @@ public: * Executes the Parser * @return */ - const unordered_map< string, vector< int> > *execute ( Document *document ); + const unordered_map< string, vector< unsigned long > > *execute ( Document *document ); private: @@ -44,9 +44,7 @@ private: * @param inFile * @return */ - //TODO instead of grabbing each line, look to see if beginning of - // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found - void parse ( string html, Tokenizer *tokenizer ); + void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ); /** @@ -64,6 +62,13 @@ private: */ string extract_title ( string & word ); + /** + * Will return true if local url + * + * @param url + * @return + */ + bool isLocal ( string url ); }; diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 7cd5f2b..bd218fd 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -17,9 +17,15 @@ void testURL ( ); int main ( ) { cout << "Testing Parser ... " << endl << endl; + cout << "Testing URL: " << endl; testURL ( ); + cout << "URL Test Passed!" << endl << endl; + cout << "Testing Simple: " << endl; testSimple( ); + cout << "Simple Test Passed!" << endl << endl; + cout << "Testing Complex: " << endl; testComplex( ); + cout << "Complex Test Passed!" << endl; cout << "Parser Tests Passed! :D" << endl; } @@ -35,17 +41,26 @@ void testSimple ( ) Parser parser( &urlFrontierTest ); auto dictionary = parser.execute( &document ); - + for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ ) + { + cout << it->first << ':'; + for ( int i = 0; i < it->second.size( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; + } assert ( dictionary != nullptr ); - assert ( dictionary->size( ) == 2 ); - assert ( dictionary->find( "cat" ) != dictionary->end( ) ); - assert ( dictionary->find( "titl" ) != dictionary->end( ) ); - assert ( dictionary->find( "this" ) == dictionary->end( ) ); - assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 ); - assert ( dictionary->at( "titl" )[ 0 ] == 1 ); + assert ( dictionary->size( ) == 4 ); + assert ( dictionary->find( "#cat" ) != dictionary->end( ) ); + assert ( dictionary->find( "$testurl" ) != dictionary->end( ) ); + assert ( dictionary->find( "#titl" ) != dictionary->end( ) ); + assert ( dictionary->find( "#this" ) == dictionary->end( ) ); + assert ( dictionary->at( "#cat" )[ 0 ] == 0 && dictionary->at( "#cat" )[ 1 ] == 2 ); + assert ( dictionary->at( "#titl" )[ 0 ] == 1 ); delete dictionary; - + dictionary = nullptr; } void testComplex ( ) @@ -60,7 +75,6 @@ void testComplex ( ) { docString += temp; } - ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" ); char *writable = new char[docString.size( ) + 1]; std::copy( docString.begin( ), docString.end( ), writable ); @@ -70,19 +84,32 @@ void testComplex ( ) Parser parser( &urlFrontierTest ); auto dictionary = parser.execute( &document ); - + for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ ) + { + cout << it->first << ':'; + for ( int i = 0; i < it->second.size( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; + } assert ( dictionary != nullptr ); - assert ( dictionary->size( ) == 3 ); + assert ( dictionary->size( ) == 8 ); - assert ( dictionary->find( "cat" ) != dictionary->end( ) ); - assert ( dictionary->find( "stori" ) != dictionary->end( ) ); - assert ( dictionary->find( "joe" ) != dictionary->end( ) ); + assert ( dictionary->find( "#cat" ) != dictionary->end( ) ); + assert ( dictionary->find( "#stori" ) != dictionary->end( ) ); + assert ( dictionary->find( "#joe" ) != dictionary->end( ) ); + assert ( dictionary->find( "$w3school" ) != dictionary->end( ) ); + assert ( dictionary->find( "$test" ) != dictionary->end( ) ); + assert ( dictionary->find( "$cat" ) != dictionary->end( ) ); - assert ( dictionary->find( "the" ) == dictionary->end( ) ); - assert ( dictionary->find( "of" ) == dictionary->end( ) ); + assert ( dictionary->find( "#the" ) == dictionary->end( ) ); + assert ( dictionary->find( "#of" ) == dictionary->end( ) ); delete dictionary; + dictionary = nullptr; delete[] writable; + writable = nullptr; } @@ -91,13 +118,27 @@ void testURL ( ) const char *line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>"; ProducerConsumerQueue< string > urlFrontierTest; - ParsedUrl url = ParsedUrl( "testurl.com" ); + ParsedUrl url = ParsedUrl( "http://testurl.com" ); char docString[10240]; strcpy( docString, line ); Document document( url, docString ); Parser parser( &urlFrontierTest ); - auto dict = parser.execute( &document ); + auto dictionary = parser.execute( &document ); + for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ ) + { + cout << it->first << ':'; + for ( int i = 0; i < it->second.size( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; + } + assert ( urlFrontierTest.Pop( ) == "http://www.bafta.org/"); - delete dict; + assert ( dictionary->find( "$bafta" ) == dictionary->end( ) ); + assert ( dictionary->find( "$testurl" ) != dictionary->end( ) ); + + delete dictionary; + dictionary = nullptr; } \ No newline at end of file diff --git a/shared/Document.cpp b/shared/Document.cpp index 54fef82..2a2f3e3 100644 --- a/shared/Document.cpp +++ b/shared/Document.cpp @@ -59,6 +59,12 @@ int Document::WriteToDocMap ( ) } +ParsedUrl Document::getUrl ( ) + { + return this->url; + } + + void Document::PrintDocMap ( string url, int location ) { pthread_mutex_lock ( &docMap_mutex ); diff --git a/shared/Document.h b/shared/Document.h index 5581d7b..5de4d87 100644 --- a/shared/Document.h +++ b/shared/Document.h @@ -44,6 +44,7 @@ public: int WriteToDocMap(); + ParsedUrl getUrl ( ); static void PrintDocMap( string url, int location ); }; \ No newline at end of file diff --git a/shared/url.h b/shared/url.h index 44fc016..91c5502 100644 --- a/shared/url.h +++ b/shared/url.h @@ -47,7 +47,7 @@ public: pathBuffer = new char[ strlen( url ) + 1 ]; char *f, *t; - for ( t = pathBuffer, f = url; *t++ = *f++; ) + for ( t = pathBuffer, f = url; ( *t++ = *f++ ); ) ; Service = pathBuffer; diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index cad3e44..dfbb9c4 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -1,12 +1,13 @@ #include "Tokenizer.h" +#include <iostream> /** * Tokenizer Cstor */ Tokenizer::Tokenizer ( ) { - docIndex = new unordered_map< string, vector< int>>; + docIndex = new unordered_map< string, vector< unsigned long > >; } /** @@ -14,7 +15,7 @@ Tokenizer::Tokenizer ( ) * * @return pointer to unordered_map< string, vector< int>> */ -unordered_map< string, vector< int>> *Tokenizer::get ( ) const +unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const { return docIndex; } @@ -23,13 +24,36 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const * Executes the Tokenizer * Sends tokens to dictionary * - * token -> [offsets] * @param originalText * @param offset + * @param decorator */ -void Tokenizer::execute ( string & originalText, unsigned long offset ) +void Tokenizer::execute ( string originalText, unsigned long offset, char decorator ) + { + // split by symbols + if ( decorator == Tokenizer::URL ) + { + vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', + '(', ')', '*', '+', ',', ';', '='}; + + tokenize( splitStr( originalText, split, true ), offset, decorator ); + } + // split by spaces + else + { + tokenize( splitStr( originalText, ' ', true ), offset, decorator ); + } + } + +/** + * Tokenizes text (titles, body text) + * + * @param originalText + * @param offset + * @param decorator + */ +void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator ) { - vector< string > splitText = splitStr( originalText, ' ', true ); string processedString = ""; for ( int i = 0; i < splitText.size( ); ++i ) { @@ -41,8 +65,13 @@ void Tokenizer::execute ( string & originalText, unsigned long offset ) { // stem word processedString = stem.execute( processedString ); - ( *docIndex )[ processedString ].push_back( offset ); - ++offset; + if ( decorator != '\0' ) + { + processedString = decorator + processedString; + } + ( *docIndex )[ processedString ].push_back( offset ); + ++offset; } } } + diff --git a/util/Tokenizer.h b/util/Tokenizer.h index ebf3b90..543f1da 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -1,5 +1,6 @@ #pragma once + #include <string> #include <unordered_map> #include <vector> @@ -13,6 +14,11 @@ class Tokenizer public: + // decorators + static const char TITLE = '#'; + static const char ANCHOR = '@'; + static const char URL = '$'; + /** * Tokenizer Cstor */ @@ -23,19 +29,31 @@ public: * * @return pointer to unordered_map< string, vector< int>> */ - unordered_map< string, vector< int>> *get ( ) const; + unordered_map< string, vector< unsigned long > > *get ( ) const; /** * Executes the Tokenizer * Sends tokens to dictionary * - * token -> [offsets] + * + * @param originalText + * @param offset + * @param decorator + */ + void execute ( string originalText, unsigned long offset, char decorator = '\0' ); + +private: + + unordered_map< string, vector< unsigned long > > *docIndex; + Stemmer stem; + + /** + * Tokenizes text (titles, body text) + * * @param originalText * @param offset + * @param decorator */ - void execute ( string &originalText, unsigned long offset ); + void tokenize ( vector< string > splitText, unsigned long offset, char decorator ); - private: - unordered_map< string, vector< int>> *docIndex; - Stemmer stem; }; diff --git a/util/stringProcessing.cpp b/util/stringProcessing.cpp index 4b9b195..8576789 100644 --- a/util/stringProcessing.cpp +++ b/util/stringProcessing.cpp @@ -5,7 +5,7 @@ #include "stringProcessing.h" #include "Stemmer.h" #include <cassert> - +#include <iostream> using namespace std; /** @@ -181,7 +181,7 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str * @param removeChars * @return vector < string > */ -vector< string > splitStr ( string & originalText, char delim , bool removeSyms) +vector< string > splitStr ( string originalText, char delim , bool removeSyms) { vector< string > splitWords; auto begin = originalText.begin( ); @@ -209,13 +209,72 @@ vector< string > splitStr ( string & originalText, char delim , bool removeSyms) } +/** + * Splits string by multiple delimiters + * + * @param originalText + * @param delims + * @param removeSyms + * @return + */ +vector< string > splitStr ( string originalText, vector < char > delims , bool removeSyms) + { + vector< string > splitWords; + char begin; + for( int i = 0; i < originalText.size( ); ++i) + { + begin = originalText[i]; + string word = ""; + while ( !inArray( begin, delims ) && i < originalText.size() ) + { + begin = originalText[i]; + if (removeSyms && ( isAlpha( begin ) || isNum( begin ) ) ) + { + word += begin; + } + ++i; + } + + if(inArray( begin, delims )) + --i; + + + if (word != "" && word != " " ) + { + splitWords.push_back( word ); + } + } + + return splitWords; + + } + +/** + * Returns true if element is in array, false otherwise + * + * @param vec + * @return + */ +template <typename T> bool inArray ( T needle, vector < T > haystack ) + { + for ( int i = 0; i < haystack.size( ); ++ i) + { + if ( haystack[ i ] == needle ) + { + return true; + } + } + return false; + } + + /** * Returns true if @word is a stopword * * @param word * @return bool */ -bool isStopWord ( string & word ) +bool isStopWord ( string word ) { return ( stopWords.find( word ) != stopWords.end( ) ); @@ -227,7 +286,7 @@ bool isStopWord ( string & word ) * @param word * @return string */ -string toLower ( string & word ) +string toLower ( string word ) { auto iter = word.begin( ); string lowerWord = ""; @@ -254,7 +313,7 @@ string toLower ( string & word ) * @param word * @return string */ -string stemWord ( string & word ) +string stemWord ( string word ) { Stemmer stemmer; word = stemmer.execute( word ); @@ -269,7 +328,7 @@ string stemWord ( string & word ) * @param len * @return string */ -string subStr ( string & word, size_t pos, size_t len ) +string subStr ( string word, size_t pos, size_t len ) { string substr = ""; for ( int i = 0; i < len; ++i ) @@ -305,7 +364,7 @@ string subStr ( string::iterator begin, string::iterator end ) * @param chars * @return string */ -string stripStr ( string & word, vector< char > chars ) +string stripStr ( string word, vector< char > chars ) { string wordStripped = ""; auto begin = word.begin( ); @@ -337,7 +396,7 @@ string stripStr ( string & word, vector< char > chars ) * @param chars * @return string */ -string stripStr ( string & word ) +string stripStr ( string word ) { string wordStripped = ""; auto begin = word.begin( ); diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 4e6de29..feca3f8 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -21,7 +21,7 @@ static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "she", "some", "the", "their", "them", "there", "they", "that", "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", - "with", + "with", "www", "you", "your" }; /** @@ -62,7 +62,25 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str * @param removeSyms * @return vector< string > */ -vector< string > splitStr ( string & originalText, char delim, bool removeSyms ); +vector< string > splitStr ( string originalText, char delim, bool removeSyms ); + +/** + * Splits string by multiple delimiters + * + * @param originalText + * @param delims + * @param removeSyms + * @return + */ +vector< string > splitStr ( string originalText, vector < char > delims, bool removeSyms ); + +/** + * Returns true if element is in array, false otherwise + * + * @param vec + * @return + */ +template <typename T> bool inArray ( T needle, vector < T > haystack ); /** * Returns true if @word is a stopword @@ -70,7 +88,7 @@ vector< string > splitStr ( string & originalText, char delim, bool removeSyms ) * @param word * @return bool */ -bool isStopWord ( string & word ); +bool isStopWord ( string word ); /** * Returns lowercase @word @@ -78,7 +96,7 @@ bool isStopWord ( string & word ); * @param word * @return string */ -string toLower ( string & word ); +string toLower ( string word ); /** * Returns stemmed @word @@ -86,7 +104,7 @@ string toLower ( string & word ); * @param word * @return string */ -string stemWord ( string & word ); +string stemWord ( string word ); /** * Returns a substring [ post, len ) @@ -96,7 +114,7 @@ string stemWord ( string & word ); * @param len * @return string */ -string subStr ( string & word, size_t pos, size_t len ); +string subStr ( string word, size_t pos, size_t len ); /** * Returns a substring [ begin, end ) @@ -114,7 +132,7 @@ string subStr ( string::iterator begin, string::iterator end ); * @param chars * @return string */ -string stripStr ( string & word, vector< char > chars ); +string stripStr ( string word, vector< char > chars ); /** * Removes all chars from word @@ -123,7 +141,7 @@ string stripStr ( string & word, vector< char > chars ); * @param word * @return string */ -string stripStr ( string & word ); +string stripStr ( string word ); /** * Returns true is character is a letter diff --git a/util/tests/tokenizerTest.cpp b/util/tests/tokenizerTest.cpp index a89e22d..235bf67 100644 --- a/util/tests/tokenizerTest.cpp +++ b/util/tests/tokenizerTest.cpp @@ -42,5 +42,5 @@ void testExecute ( string original ) cout << std::endl; } delete dict; - + dict = nullptr; } -- GitLab