From 06bbce9da352feb7dfd63c74879f65e59903e1a8 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Sat, 17 Mar 2018 21:26:16 -0400 Subject: [PATCH] fixed offsets --- CMakeLists.txt | 4 ++-- crawler/SocketReader.cpp | 1 + parser/Parser.cpp | 8 ++++---- util/Tokenizer.cpp | 9 +++++---- util/Tokenizer.h | 4 ++-- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5e04fc..721b631 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp) add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp) -#find_package(OpenSSL REQUIRED) +find_package(OpenSSL REQUIRED) -#target_link_libraries(crawler-parser-test OpenSSL::SSL pthread) +target_link_libraries(crawler-parser-test OpenSSL::SSL pthread) diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp index 56fe0c2..172f64b 100644 --- a/crawler/SocketReader.cpp +++ b/crawler/SocketReader.cpp @@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s ) front = temp + strlen(http_buff); delete[] http_buff; http_buff = temp; + } return http_buff; diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 334c123..56d9646 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) { auto htmlIt = html.begin( ); - unsigned long offset = 0; + unsigned long offsetTitle = 0; + unsigned long offsetURL = 0; // tokenize url string host = ""; @@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) path.assign( currentUrl.Path ); string url = host + "/" + path; - tokenizer->execute( url, offset, Tokenizer::URL ); + offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL ); while ( htmlIt != html.end( ) ) { @@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) string title = extract_title( line ); if ( title != "" ) { - tokenizer->execute( title, offset, Tokenizer::TITLE ); + offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE ); } } - offset = htmlIt - html.begin( ); } else { diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index dfbb9c4..d8553e4 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const * @param offset * @param decorator */ -void Tokenizer::execute ( string originalText, unsigned long offset, char decorator ) +unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator ) { // split by symbols if ( decorator == Tokenizer::URL ) @@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='}; - tokenize( splitStr( originalText, split, true ), offset, decorator ); + return tokenize( splitStr( originalText, split, true ), offset, decorator ); } // split by spaces else { - tokenize( splitStr( originalText, ' ', true ), offset, decorator ); + return tokenize( splitStr( originalText, ' ', true ), offset, decorator ); } } @@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora * @param offset * @param decorator */ -void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator ) +unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator ) { string processedString = ""; for ( int i = 0; i < splitText.size( ); ++i ) @@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch ++offset; } } + return offset; } diff --git a/util/Tokenizer.h b/util/Tokenizer.h index 543f1da..6a8c373 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -40,7 +40,7 @@ public: * @param offset * @param decorator */ - void execute ( string originalText, unsigned long offset, char decorator = '\0' ); + unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' ); private: @@ -54,6 +54,6 @@ private: * @param offset * @param decorator */ - void tokenize ( vector< string > splitText, unsigned long offset, char decorator ); + unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator ); }; -- GitLab