From 06bbce9da352feb7dfd63c74879f65e59903e1a8 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Sat, 17 Mar 2018 21:26:16 -0400
Subject: [PATCH] Fix token offsets: track URL and title offsets separately

---
 CMakeLists.txt           | 4 ++--
 crawler/SocketReader.cpp | 1 +
 parser/Parser.cpp        | 8 ++++----
 util/Tokenizer.cpp       | 9 +++++----
 util/Tokenizer.h         | 4 ++--
 5 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5e04fc..721b631 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp)
 
 add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp)
 
-#find_package(OpenSSL REQUIRED)
+find_package(OpenSSL REQUIRED)
 
-#target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
+target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp
index 56fe0c2..172f64b 100644
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s )
 		front = temp + strlen(http_buff);
 		delete[] http_buff;
 		http_buff = temp;
+
 		}
 
 	return http_buff;
diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 334c123..56d9646 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	{
 
 	auto htmlIt = html.begin( );
-	unsigned long offset = 0;
+	unsigned long offsetTitle = 0;
+	unsigned long offsetURL = 0;
 
 	// tokenize url
 	string host = "";
@@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	path.assign( currentUrl.Path );
 	string url = host + "/" + path;
 
-	tokenizer->execute( url, offset, Tokenizer::URL );
+	offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );
 
 	while ( htmlIt != html.end( ) )
 		{
@@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 				string title = extract_title( line );
 				if ( title != "" )
 					{
-					tokenizer->execute( title, offset, Tokenizer::TITLE );
+					offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
 					}
 				}
-			offset = htmlIt - html.begin( );
 			}
 		else
 			{
diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp
index dfbb9c4..d8553e4 100644
--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
@@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
  * @param offset
  * @param decorator
  */
-void Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
+unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
 	{
 	// split by symbols
 	if ( decorator == Tokenizer::URL )
@@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
 		vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
 		                          '(', ')', '*', '+', ',', ';', '='};
 
-		tokenize( splitStr( originalText, split, true ), offset, decorator );
+		return tokenize( splitStr( originalText, split, true ), offset, decorator );
 		}
 	// split by spaces
 	else
 		{
-		tokenize( splitStr( originalText, ' ', true ), offset, decorator );
+		return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
 		}
 	}
 
@@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
  * @param offset
  * @param decorator
  */
-void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
+unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
 	{
 	string processedString = "";
 	for ( int i = 0; i < splitText.size( ); ++i )
@@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch
 				++offset;
 			}
 		}
+	return offset;
 	}
 
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index 543f1da..6a8c373 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -40,7 +40,7 @@ public:
 	 * @param offset
 	 * @param decorator
 	 */
-	void execute ( string originalText, unsigned long offset, char decorator = '\0' );
+	unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
 
 private:
 
@@ -54,6 +54,6 @@ private:
 	 * @param offset
 	 * @param decorator
 	 */
-	void tokenize ( vector< string > splitText, unsigned long offset, char decorator );
+	unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
 
 	};
-- 
GitLab