diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 94d50acdb257472386c9355a18774ff3d0342155..fc9c565dbd4a25860577f1d5d3e05e8575a3371f 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -71,10 +71,9 @@ void Spider::FuncToRun() string pathToDisk = localPath + "/crawlerOutput/" + to_string(docID)+ ".txt"; int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk); - - Document document ( currentUrl, reader->buffer ); auto dict = parser.execute ( &document ); + cout << "docID: " << docID << endl; for ( auto it = dict->begin( ); it != dict->end( ); it++ ) { cout << it->first << " : "; @@ -84,6 +83,8 @@ void Spider::FuncToRun() } cout << std::endl; } + cout << std::endl; + delete dict; cond = true; } diff --git a/parser/Parser.cpp b/parser/Parser.cpp index e0425c0e841cdfa62ecbe9a146df181e44fabdb1..1df6081b473ab5a24bbfb1019ecf6bec958430b5 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -33,7 +33,7 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume void Parser::parse ( string html, Tokenizer *tokenizer ) { auto htmlIt = html.begin( ); - int offset = 0; + unsigned long offset = 0; while ( htmlIt != html.end( ) ) { // if open bracket @@ -49,6 +49,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) if ( url != "" ) { urlFrontier->Push( url ); + cout << url << endl; } // check if line is title else @@ -59,7 +60,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) tokenizer->execute( title, offset ); } } - //TODO fix offset? offset = htmlIt - html.begin( ); } else @@ -86,7 +86,11 @@ string Parser::extract_url ( string & word ) if ( *foundHttp != '\0' ) { url = ""; - auto closeTag = findNext( ">", word.begin( ) ); + auto closeTag = findNext( ">", foundHref ); + if ( *closeTag != '\0' && *( closeTag - 1 ) == '\"' ) + { + closeTag -= 1; + } while ( *foundHttp != *closeTag ) { url += *foundHttp; diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 542bfbe4c6fdea7309fc9bcfad55a62884e5754d..48ec8c50031e517e988f684d93aaf0502149372b 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -9,13 +9,37 @@ using namespace std; void testSimple ( ); + void testComplex ( ); + int main ( ) { cout << "Testing Parser ... " << endl << endl; - testSimple (); - testComplex (); + + const char * line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>"; + + ProducerConsumerQueue< string > urlFrontierTest; + ParsedUrl url = ParsedUrl( "testurl.com" ); + char docString[10240]; + strcpy( docString, line ); + Document document( url, docString ); + + Parser parser( &urlFrontierTest ); + auto dict = parser.execute( &document ); + + for ( auto it = dict->begin( ); it != dict->end( ); it++ ) + { + cout << it->first << ':'; + for ( int i = 0; i < it->second.size( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; + } + +// testSimple( ); +// testComplex( ); cout << "Parser Tests Passed! :D" << endl; } @@ -23,60 +47,61 @@ int main ( ) void testSimple ( ) { - ProducerConsumerQueue < string > urlFrontierTest; - ParsedUrl url = ParsedUrl("testurl.com"); + ProducerConsumerQueue< string > urlFrontierTest; + ParsedUrl url = ParsedUrl( "testurl.com" ); char docString[10240]; - strcpy(docString, "<title>This Cat Title Cat</title>"); - Document document ( url, docString); + strcpy( docString, "<title>This Cat Title Cat</title>" ); + Document document( url, docString ); - Parser parser ( &urlFrontierTest ); - auto dictionary = parser.execute ( &document ); + Parser parser( &urlFrontierTest ); + auto dictionary = parser.execute( &document ); assert ( dictionary != nullptr ); - assert ( dictionary->size () == 2); - assert ( dictionary->find ( "cat" ) != dictionary->end () ); - assert ( dictionary->find ( "titl" ) != dictionary->end () ); - assert ( dictionary->find ( "this" ) == dictionary->end () ); - assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 ); - assert ( dictionary->at ( "titl" )[ 0 ] == 1 ); + assert ( dictionary->size( ) == 2 ); + assert ( dictionary->find( "cat" ) != dictionary->end( ) ); + assert ( dictionary->find( "titl" ) != dictionary->end( ) ); + assert ( dictionary->find( "this" ) == dictionary->end( ) ); + assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 ); + assert ( dictionary->at( "titl" )[ 0 ] == 1 ); delete dictionary; } + void testComplex ( ) { - ProducerConsumerQueue < string > urlFrontierTest; - ifstream file("../tests/cats.html"); + ProducerConsumerQueue< string > urlFrontierTest; + ifstream file( "../tests/cats.html" ); string temp; string docString = "<title>Joe the Cat</title>\n"; docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n"; - while ( std::getline ( file, temp ) ) + while ( std::getline( file, temp ) ) { docString += temp; } - ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html"); - char * writable = new char[docString.size( ) + 1]; - std::copy(docString.begin( ), docString.end( ), writable); + ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" ); + char *writable = new char[docString.size( ) + 1]; + std::copy( docString.begin( ), docString.end( ), writable ); writable[ docString.size( ) ] = '\0'; - Document document ( url, writable ); + Document document( url, writable ); - Parser parser ( &urlFrontierTest ); - auto dictionary = parser.execute ( &document ); + Parser parser( &urlFrontierTest ); + auto dictionary = parser.execute( &document ); assert ( dictionary != nullptr ); - assert ( dictionary->size () == 3); + assert ( dictionary->size( ) == 3 ); - assert ( dictionary->find ( "cat" ) != dictionary->end () ); - assert ( dictionary->find ( "stori" ) != dictionary->end () ); - assert ( dictionary->find ( "joe" ) != dictionary->end () ); + assert ( dictionary->find( "cat" ) != dictionary->end( ) ); + assert ( dictionary->find( "stori" ) != dictionary->end( ) ); + assert ( dictionary->find( "joe" ) != dictionary->end( ) ); - assert ( dictionary->find ( "the" ) == dictionary->end () ); - assert ( dictionary->find ( "of" ) == dictionary->end () ); + assert ( dictionary->find( "the" ) == dictionary->end( ) ); + assert ( dictionary->find( "of" ) == dictionary->end( ) ); delete dictionary; delete[] writable; - } \ No newline at end of file + } diff --git a/tests/crawlerOutput_1.txt b/tests/crawlerOutput_1.txt new file mode 100755 index 0000000000000000000000000000000000000000..91a902c518ec298d489c7336b154a015c08eca84 Binary files /dev/null and b/tests/crawlerOutput_1.txt differ diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index 84a815792485bd589e800ff152b08966bda55f32..cad3e441fb88f0a62b6a71a84f6b0dd8bbf307b6 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -27,22 +27,20 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const * @param originalText * @param offset */ -void Tokenizer::execute ( string & originalText, int offset ) +void Tokenizer::execute ( string & originalText, unsigned long offset ) { - vector< string > splitText = splitStr( originalText, ' ' ); + vector< string > splitText = splitStr( originalText, ' ', true ); string processedString = ""; for ( int i = 0; i < splitText.size( ); ++i ) { // case fold processedString = toLower( splitText[ i ] ); //strip all characters - processedString = stripStr( processedString ); if ( !isStopWord( processedString ) ) { // stem word - //FIXME -// processedString = stem.execute( processedString ); + processedString = stem.execute( processedString ); ( *docIndex )[ processedString ].push_back( offset ); ++offset; } diff --git a/util/Tokenizer.h b/util/Tokenizer.h index bccbcfaf23c7c9f19de168550ff57052696caa53..ebf3b9029abd065e2491731ee7782c8d39989791 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -33,7 +33,7 @@ public: * @param originalText * @param offset */ - void execute ( string &originalText, int offset ); + void execute ( string &originalText, unsigned long offset ); private: unordered_map< string, vector< int>> *docIndex; diff --git a/util/stringProcessing.cpp b/util/stringProcessing.cpp index 391dd0b4fd1935fa8c1413b51aa3b2e2ff0b6416..4b9b19555c067224688ca181707767b5891d4fae 100644 --- a/util/stringProcessing.cpp +++ b/util/stringProcessing.cpp @@ -174,12 +174,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str /** * Returns a vector of strings from @originalText, split by @delim + * Will remove symbols if bool is set * * @param originalText * @param delim + * @param removeChars * @return vector < string > */ -vector< string > splitStr ( string & originalText, char delim ) +vector< string > splitStr ( string & originalText, char delim , bool removeSyms) { vector< string > splitWords; auto begin = originalText.begin( ); @@ -189,7 +191,10 @@ vector< string > splitStr ( string & originalText, char delim ) string word = ""; while ( *begin != delim && *begin != '\0' ) { - word += *begin; + if (removeSyms && ( isAlpha( *begin ) || isNum( *begin ) ) ) + { + word += *begin; + } ++begin; } diff --git a/util/stringProcessing.h b/util/stringProcessing.h index c3e6a7c10eb9a0cae3060761c8e99b215af8100c..4e6de29b79d3b859dc7cd9dbae77c8c90d514e2b 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -55,12 +55,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str /** * Returns a vector of strings from @originalText, split by @delim + * Will remove symbols if bool is set * * @param originalText * @param delim + * @param removeSyms * @return vector< string > */ -vector< string > splitStr ( string & originalText, char delim ); +vector< string > splitStr ( string & originalText, char delim, bool removeSyms ); /** * Returns true if @word is a stopword diff --git a/util/tests/stringProcessingTest.cpp b/util/tests/stringProcessingTest.cpp index c2fb6a34d2c4afaeeb1a72d543717cd686b72dfb..c22c279b1b689dd7c8cd1f34ebdae7b4b0a2fa9a 100644 --- a/util/tests/stringProcessingTest.cpp +++ b/util/tests/stringProcessingTest.cpp @@ -136,11 +136,11 @@ void testSplitStr ( string original ) { cout << "Testing splitStr..." << endl; - vector< string > vec = splitStr( original, ' ' ); + vector< string > vec = splitStr( original, ' ', true); assert( vec.size( ) == 53 ); string word = "hello\ngoodbye"; - vec = splitStr( word, '\n' ); + vec = splitStr( word, '\n', true ); assert( vec.size( ) == 2 ); assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );