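Fix token offsets: track URL and title offsets separately, and have Tokenizer::execute/tokenize return the updated offset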

06bbce9d · vcday · 76bf3f83 · 06bbce9d · 06bbce9d · 06bbce9d
Commit 06bbce9d authored 7 years ago by vcday
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp)
 add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp)
-#find_package(OpenSSL REQUIRED)
+find_package(OpenSSL REQUIRED)
-#target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
+target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s )
 		front = temp + strlen(http_buff);
 		delete[] http_buff;
 		http_buff = temp;
 		}
 	return http_buff;

--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	{
 	auto htmlIt = html.begin( );
-	unsigned long offset = 0;
+	unsigned long offsetTitle = 0;
+	unsigned long offsetURL = 0;
 	// tokenize url
 	string host = "";
@@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	path.assign( currentUrl.Path );
 	string url = host + "/" + path;
-	tokenizer->execute( url, offset, Tokenizer::URL );
+	offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );
 	while ( htmlIt != html.end( ) )
 		{
@@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 				string title = extract_title( line );
 				if ( title != "" )
 					{
-					tokenizer->execute( title, offset, Tokenizer::TITLE );
+					offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
 					}
 				}
-			offset = htmlIt - html.begin( );
 			}
 		else
 			{

--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
@@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
 * @param offset
 * @param decorator
 */
-void Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
+unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
 	{
 	// split by symbols
 	if ( decorator == Tokenizer::URL )
@@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
 		vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
 		                          '(', ')', '*', '+', ',', ';', '='};
-		tokenize( splitStr( originalText, split, true ), offset, decorator );
+		return tokenize( splitStr( originalText, split, true ), offset, decorator );
 		}
 	// split by spaces
 	else
 		{
-		tokenize( splitStr( originalText, ' ', true ), offset, decorator );
+		return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
 		}
 	}
@@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
 * @param offset
 * @param decorator
 */
-void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
+unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
 	{
 	string processedString = "";
 	for ( int i = 0; i < splitText.size( ); ++i )
@@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch
 				++offset;
 			}
 		}
+	return offset;
 	}
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -40,7 +40,7 @@ public:
 	 * @param offset
 	 * @param decorator
 	 */
-	void execute ( string originalText, unsigned long offset, char decorator = '\0' );
+	unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
 private:
@@ -54,6 +54,6 @@ private:
 	 * @param offset
 	 * @param decorator
 	 */
-	void tokenize ( vector< string > splitText, unsigned long offset, char decorator );
+	unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
 	};