From 2159787c29ae5fd1fa5ed39f0d9cdec45d72cc73 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Sat, 17 Mar 2018 13:31:16 -0400
Subject: [PATCH] Fix parsing errors; tokenize URLs and decorate title/URL tokens

---
 crawler/spider.cpp           |  3 +-
 parser/Parser.cpp            | 38 ++++++++++++++---
 parser/Parser.h              | 13 ++++--
 parser/tests/parserTest.cpp  | 79 +++++++++++++++++++++++++++---------
 shared/Document.cpp          |  6 +++
 shared/Document.h            |  1 +
 shared/url.h                 |  2 +-
 util/Tokenizer.cpp           | 43 ++++++++++++++++----
 util/Tokenizer.h             | 30 +++++++++++---
 util/stringProcessing.cpp    | 75 ++++++++++++++++++++++++++++++----
 util/stringProcessing.h      | 34 ++++++++++++----
 util/tests/tokenizerTest.cpp |  2 +-
 12 files changed, 265 insertions(+), 61 deletions(-)

diff --git a/crawler/spider.cpp b/crawler/spider.cpp
index fc9c565..d377fab 100644
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -73,6 +73,7 @@ void Spider::FuncToRun()
 
 				Document document ( currentUrl, reader->buffer );
 				auto dict = parser.execute ( &document );
+				
 				cout << "docID: " << docID << endl;
 				for ( auto it = dict->begin( ); it != dict->end( ); it++ )
 					{
@@ -85,7 +86,7 @@ void Spider::FuncToRun()
 					}
 				cout << std::endl;
 				delete dict;
-
+				dict = nullptr;
 				cond = true;
 				}
 			else
diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 99b46d4..334c123 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -16,10 +16,10 @@ Parser::Parser ( ProducerConsumerQueue< string > *urlFrontierIn )
  * Executes the Parser
  * @return
  */
-const unordered_map< string, vector< int > > *Parser::execute ( Document *document )
+const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document )
 	{
 	Tokenizer tokenizer;
-	parse( document->DocToString( ), &tokenizer );
+	parse( document->DocToString( ), document->getUrl( ), &tokenizer );
 	return tokenizer.get( );
 	}
 
@@ -28,12 +28,21 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume
  * @param inFile
  * @return
  */
-//TODO instead of grabbing each line, look to see if beginning of
-// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
-void Parser::parse ( string html, Tokenizer *tokenizer )
+void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	{
+
 	auto htmlIt = html.begin( );
 	unsigned long offset = 0;
+
+	// tokenize url
+	string host = "";
+	host.assign( currentUrl.Host );
+	string path = "";
+	path.assign( currentUrl.Path );
+	string url = host + "/" + path;
+
+	tokenizer->execute( url, offset, Tokenizer::URL );
+
 	while ( htmlIt != html.end( ) )
 		{
 		// if open bracket
@@ -48,7 +57,14 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 			string url = extract_url( line );
 			if ( url != "" )
 				{
+				if ( isLocal ( url ) )
+					{
+					string completeUrl = "";
+					completeUrl.assign( currentUrl.CompleteUrl );
+					url = completeUrl + url;
+					}
 				urlFrontier->Push( url );
+				cout << url << endl;
 				}
 				// check if line is title
 			else
@@ -56,7 +72,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 				string title = extract_title( line );
 				if ( title != "" )
 					{
-					tokenizer->execute( title, offset );
+					tokenizer->execute( title, offset, Tokenizer::TITLE );
 					}
 				}
 			offset = htmlIt - html.begin( );
@@ -123,3 +139,13 @@ string Parser::extract_title ( string & word )
 	return title;
 	}
 
+/**
+ * Will return true if local url (starts with '/'); empty url is not local
+ *
+ * @param url
+ * @return bool
+ */
+bool Parser::isLocal ( string url )
+	{
+	return ( !url.empty( ) && url.front( ) == '/' );
+	}
\ No newline at end of file
diff --git a/parser/Parser.h b/parser/Parser.h
index dc1f355..7916a9b 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -33,7 +33,7 @@ public:
 	 * Executes the Parser
 	 * @return
 	 */
-	const unordered_map< string, vector< int> > *execute ( Document *document );
+	const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
 
 
 private:
@@ -44,9 +44,7 @@ private:
 	 * @param inFile
 	 * @return
 	 */
-	//TODO instead of grabbing each line, look to see if beginning of
-	// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
-	void parse ( string html, Tokenizer *tokenizer );
+	void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
 
 
 	/**
@@ -64,6 +62,13 @@ private:
 	 */
 	string extract_title ( string & word );
 
+	/**
+	 * Will return true if local url
+	 *
+	 * @param url
+	 * @return
+	 */
+	bool isLocal ( string url );
 
 	};
 
diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp
index 7cd5f2b..bd218fd 100644
--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
@@ -17,9 +17,15 @@ void testURL ( );
 int main ( )
 	{
 	cout << "Testing Parser ... " << endl << endl;
+	cout << "Testing URL: " << endl;
 	testURL ( );
+	cout << "URL Test Passed!" << endl << endl;
+	cout << "Testing Simple: " << endl;
 	testSimple( );
+	cout << "Simple Test Passed!" << endl << endl;
+	cout << "Testing Complex: " << endl;
 	testComplex( );
+	cout << "Complex Test Passed!" << endl;
 	cout << "Parser Tests Passed! :D" << endl;
 
 	}
@@ -35,17 +41,26 @@ void testSimple ( )
 
 	Parser parser( &urlFrontierTest );
 	auto dictionary = parser.execute( &document );
-
+	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+		{
+		cout << it->first << ':';
+		for ( int i = 0; i < it->second.size( ); ++i )
+			{
+			cout << it->second[ i ] << " ";
+			}
+		cout << std::endl;
+		}
 	assert ( dictionary != nullptr );
-	assert ( dictionary->size( ) == 2 );
-	assert ( dictionary->find( "cat" ) != dictionary->end( ) );
-	assert ( dictionary->find( "titl" ) != dictionary->end( ) );
-	assert ( dictionary->find( "this" ) == dictionary->end( ) );
-	assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 );
-	assert ( dictionary->at( "titl" )[ 0 ] == 1 );
+	assert ( dictionary->size( ) == 4 );
+	assert ( dictionary->find( "#cat" ) != dictionary->end( ) );
+	assert ( dictionary->find( "$testurl" ) != dictionary->end( ) );
+	assert ( dictionary->find( "#titl" ) != dictionary->end( ) );
+	assert ( dictionary->find( "#this" ) == dictionary->end( ) );
+	assert ( dictionary->at( "#cat" )[ 0 ] == 0 && dictionary->at( "#cat" )[ 1 ] == 2 );
+	assert ( dictionary->at( "#titl" )[ 0 ] == 1 );
 
 	delete dictionary;
-
+	dictionary = nullptr;
 	}
 
 void testComplex ( )
@@ -60,7 +75,6 @@ void testComplex ( )
 		{
 		docString += temp;
 		}
-
 	ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" );
 	char *writable = new char[docString.size( ) + 1];
 	std::copy( docString.begin( ), docString.end( ), writable );
@@ -70,19 +84,32 @@ void testComplex ( )
 
 	Parser parser( &urlFrontierTest );
 	auto dictionary = parser.execute( &document );
-
+	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+		{
+		cout << it->first << ':';
+		for ( int i = 0; i < it->second.size( ); ++i )
+			{
+			cout << it->second[ i ] << " ";
+			}
+		cout << std::endl;
+		}
 	assert ( dictionary != nullptr );
-	assert ( dictionary->size( ) == 3 );
+	assert ( dictionary->size( ) == 8 );
 
-	assert ( dictionary->find( "cat" ) != dictionary->end( ) );
-	assert ( dictionary->find( "stori" ) != dictionary->end( ) );
-	assert ( dictionary->find( "joe" ) != dictionary->end( ) );
+	assert ( dictionary->find( "#cat" ) != dictionary->end( ) );
+	assert ( dictionary->find( "#stori" ) != dictionary->end( ) );
+	assert ( dictionary->find( "#joe" ) != dictionary->end( ) );
+	assert ( dictionary->find( "$w3school" ) != dictionary->end( ) );
+	assert ( dictionary->find( "$test" ) != dictionary->end( ) );
+	assert ( dictionary->find( "$cat" ) != dictionary->end( ) );
 
-	assert ( dictionary->find( "the" ) == dictionary->end( ) );
-	assert ( dictionary->find( "of" ) == dictionary->end( ) );
+	assert ( dictionary->find( "#the" ) == dictionary->end( ) );
+	assert ( dictionary->find( "#of" ) == dictionary->end( ) );
 
 	delete dictionary;
+	dictionary = nullptr;
 	delete[] writable;
+	writable = nullptr;
 
 	}
 
@@ -91,13 +118,27 @@ void testURL ( )
 	const char *line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>";
 
 	ProducerConsumerQueue< string > urlFrontierTest;
-	ParsedUrl url = ParsedUrl( "testurl.com" );
+	ParsedUrl url = ParsedUrl( "http://testurl.com" );
 	char docString[10240];
 	strcpy( docString, line );
 	Document document( url, docString );
 
 	Parser parser( &urlFrontierTest );
-	auto dict = parser.execute( &document );
+	auto dictionary = parser.execute( &document );
+	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+		{
+		cout << it->first << ':';
+		for ( int i = 0; i < it->second.size( ); ++i )
+			{
+			cout << it->second[ i ] << " ";
+			}
+		cout << std::endl;
+		}
+
 	assert ( urlFrontierTest.Pop( ) == "http://www.bafta.org/");
-	delete dict;
+	assert ( dictionary->find( "$bafta" ) == dictionary->end( ) );
+	assert ( dictionary->find( "$testurl" ) != dictionary->end( ) );
+
+	delete dictionary;
+	dictionary = nullptr;
 	}
\ No newline at end of file
diff --git a/shared/Document.cpp b/shared/Document.cpp
index 54fef82..2a2f3e3 100644
--- a/shared/Document.cpp
+++ b/shared/Document.cpp
@@ -59,6 +59,12 @@ int  Document::WriteToDocMap ( )
 	}
 
 
+ParsedUrl Document::getUrl ( )
+	{
+	return url;   // returned by value: the caller gets its own ParsedUrl copy
+	}
+
+
 void  Document::PrintDocMap ( string url, int location )
 	{
 	pthread_mutex_lock ( &docMap_mutex );
diff --git a/shared/Document.h b/shared/Document.h
index 5581d7b..5de4d87 100644
--- a/shared/Document.h
+++ b/shared/Document.h
@@ -44,6 +44,7 @@ public:
 
 	int WriteToDocMap();
 
+	ParsedUrl getUrl ( );
 
 	static void PrintDocMap( string url, int location );
 	};
\ No newline at end of file
diff --git a/shared/url.h b/shared/url.h
index 44fc016..91c5502 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -47,7 +47,7 @@ public:
 
 		pathBuffer = new char[ strlen( url ) + 1 ];
 		char *f, *t;
-		for ( t = pathBuffer, f = url;  *t++ = *f++; )
+		for ( t = pathBuffer, f = url;  ( *t++ = *f++ ); )
 			;
 
 		Service = pathBuffer;
diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp
index cad3e44..dfbb9c4 100644
--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
@@ -1,12 +1,13 @@
 
 #include "Tokenizer.h"
+#include <iostream>
 
 /**
  * Tokenizer Cstor
  */
 Tokenizer::Tokenizer ( )
 	{
-	docIndex = new unordered_map< string, vector< int>>;
+	docIndex = new unordered_map< string, vector< unsigned long > >;
 	}
 
 /**
@@ -14,7 +15,7 @@ Tokenizer::Tokenizer ( )
  *
  * @return pointer to unordered_map< string, vector< int>>
  */
-unordered_map< string, vector< int>> *Tokenizer::get ( ) const
+unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
 	{
 	return docIndex;
 	}
@@ -23,13 +24,36 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const
  * Executes the Tokenizer
  * Sends tokens to dictionary
  *
- * token -> [offsets]
  * @param originalText
  * @param offset
+ * @param decorator
  */
-void Tokenizer::execute ( string & originalText, unsigned long offset )
+void Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
+	{
+	// anything that is not a url is split by spaces
+	if ( decorator != Tokenizer::URL )
+		{
+		tokenize( splitStr( originalText, ' ', true ), offset, decorator );
+		return;
+		}
+
+	// urls are split by symbols, so the pieces between the symbols
+	// become separate tokens
+	vector < char > urlSymbols = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
+	                               '(', ')', '*', '+', ',', ';', '='};
+
+	tokenize( splitStr( originalText, urlSymbols, true ), offset, decorator );
+	}
+
+/**
+ * Tokenizes text (titles, body text)
+ *
+ * @param splitText
+ * @param offset
+ * @param decorator
+ */
+void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
 	{
-	vector< string > splitText = splitStr( originalText, ' ', true );
 	string processedString = "";
 	for ( int i = 0; i < splitText.size( ); ++i )
 		{
@@ -41,8 +65,13 @@ void Tokenizer::execute ( string & originalText, unsigned long offset )
 			{
 			// stem word
 			processedString = stem.execute( processedString );
-			( *docIndex )[ processedString ].push_back( offset );
-			++offset;
+			if ( decorator != '\0' )
+				{
+				processedString = decorator + processedString;
+				}
+				( *docIndex )[ processedString ].push_back( offset );
+				++offset;
 			}
 		}
 	}
+
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index ebf3b90..543f1da 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -1,5 +1,6 @@
 
 #pragma once
+
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -13,6 +14,11 @@ class Tokenizer
 
 public:
 
+	// decorators
+	static const char TITLE = '#';
+	static const char ANCHOR = '@';
+	static const char URL = '$';
+
 	/**
  	* Tokenizer Cstor
  	*/
@@ -23,19 +29,31 @@ public:
 	 *
  	* @return pointer to unordered_map< string, vector< int>>
  	*/
-	unordered_map< string, vector< int>> *get ( ) const;
+	unordered_map< string, vector< unsigned long > > *get ( ) const;
 
 	/**
 	 * Executes the Tokenizer
 	 * Sends tokens to dictionary
 	 *
-	 * token -> [offsets]
+	 *
+	 * @param originalText
+	 * @param offset
+	 * @param decorator
+	 */
+	void execute ( string originalText, unsigned long offset, char decorator = '\0' );
+
+private:
+
+	unordered_map< string, vector< unsigned long > > *docIndex;
+	Stemmer stem;
+
+	/**
+	 * Tokenizes text (titles, body text)
+	 *
 	 * @param originalText
 	 * @param offset
+	 * @param decorator
 	 */
-	void execute ( string &originalText, unsigned long offset );
+	void tokenize ( vector< string > splitText, unsigned long offset, char decorator );
 
-	private:
-		unordered_map< string, vector< int>> *docIndex;
-		Stemmer stem;
 	};
diff --git a/util/stringProcessing.cpp b/util/stringProcessing.cpp
index 4b9b195..8576789 100644
--- a/util/stringProcessing.cpp
+++ b/util/stringProcessing.cpp
@@ -5,7 +5,7 @@
 #include "stringProcessing.h"
 #include "Stemmer.h"
 #include <cassert>
-
+#include <iostream>
 using namespace std;
 
 /**
@@ -181,7 +181,7 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
  * @param removeChars
  * @return vector < string >
  */
-vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
+vector< string > splitStr ( string originalText, char delim , bool removeSyms)
 	{
 	vector< string > splitWords;
 	auto begin = originalText.begin( );
@@ -209,13 +209,72 @@ vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
 
 	}
 
+/**
+ * Splits string by multiple delimiters; empty tokens (leading, trailing or
+ * repeated delimiters) are skipped
+ *
+ * @param originalText text to split
+ * @param delims characters that end a token and are never part of one
+ * @param removeSyms if true, only letters and digits are kept in a token
+ * @return vector < string >
+ */
+vector< string > splitStr ( string originalText, vector < char > delims , bool removeSyms)
+	{
+	vector< string > splitWords;
+	string word = "";
+	// walk one step past the end so the last token is flushed like any other
+	for ( size_t i = 0; i <= originalText.size( ); ++i )
+		{
+		bool atEnd = ( i == originalText.size( ) );
+		if ( !atEnd && !inArray( originalText[ i ], delims ) )
+			{
+			char current = originalText[ i ];
+			// keep every char unless symbols are being stripped
+			if ( !removeSyms || isAlpha( current ) || isNum( current ) )
+				{
+				word += current;
+				}
+			continue;
+			}
+
+		// a delimiter or the end of the text closes the token; i always
+		// advances, so leading or repeated delimiters cannot stall the loop
+		if ( word != "" && word != " " )
+			{
+			splitWords.push_back( word );
+			}
+		word = "";
+		}
+
+	return splitWords;
+	}
+
+/**
+ * Returns true if needle is an element of haystack, false otherwise
+ *
+ * @param needle
+ * @param haystack
+ * @return bool
+ */
+template <typename T> bool inArray ( T needle, vector < T > haystack )
+	{
+	// linear scan that stops at the first match
+	for ( const T & candidate : haystack )
+		{
+		if ( candidate == needle )
+			return true;
+		}
+	return false;
+	}
+
+
 /**
  * Returns true if @word is a stopword
  *
  * @param word
  * @return bool
  */
-bool isStopWord ( string & word )
+bool isStopWord ( string word )
 	{
 	return ( stopWords.find( word ) != stopWords.end( ) );
 
@@ -227,7 +286,7 @@ bool isStopWord ( string & word )
  * @param word
  * @return string
  */
-string toLower ( string & word )
+string toLower ( string word )
 	{
 	auto iter = word.begin( );
 	string lowerWord = "";
@@ -254,7 +313,7 @@ string toLower ( string & word )
  * @param word
  * @return string
  */
-string stemWord ( string & word )
+string stemWord ( string word )
 	{
 	Stemmer stemmer;
 	word = stemmer.execute( word );
@@ -269,7 +328,7 @@ string stemWord ( string & word )
  * @param len
  * @return string
  */
-string subStr ( string & word, size_t pos, size_t len )
+string subStr ( string word, size_t pos, size_t len )
 	{
 	string substr = "";
 	for ( int i = 0; i < len; ++i )
@@ -305,7 +364,7 @@ string subStr ( string::iterator begin, string::iterator end )
  * @param chars
  * @return string
  */
-string stripStr ( string & word, vector< char > chars )
+string stripStr ( string word, vector< char > chars )
 	{
 	string wordStripped = "";
 	auto begin = word.begin( );
@@ -337,7 +396,7 @@ string stripStr ( string & word, vector< char > chars )
  * @param chars
  * @return string
  */
-string stripStr ( string & word )
+string stripStr ( string word )
 	{
 	string wordStripped = "";
 	auto begin = word.begin( );
diff --git a/util/stringProcessing.h b/util/stringProcessing.h
index 4e6de29..feca3f8 100644
--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -21,7 +21,7 @@ static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as",
                                    "she",
                                    "some", "the", "their", "them", "there", "they", "that",
                                    "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will",
-                                   "with",
+                                   "with", "www",
                                    "you", "your" };
 
 /**
@@ -62,7 +62,25 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
  * @param removeSyms
  * @return vector< string >
  */
-vector< string > splitStr ( string & originalText, char delim, bool removeSyms );
+vector< string > splitStr ( string originalText, char delim, bool removeSyms );
+
+/**
+ * Splits string by multiple delimiters
+ *
+ * @param originalText
+ * @param delims
+ * @param removeSyms
+ * @return
+ */
+vector< string > splitStr ( string originalText, vector < char > delims, bool removeSyms );
+
+/**
+ * Returns true if needle is an element of haystack, false otherwise
+ * @param needle
+ * @param haystack
+ * @return bool
+ */
+template <typename T> bool inArray ( T needle, vector < T > haystack );
 
 /**
  * Returns true if @word is a stopword
@@ -70,7 +88,7 @@ vector< string > splitStr ( string & originalText, char delim, bool removeSyms )
  * @param word
  * @return bool
  */
-bool isStopWord ( string & word );
+bool isStopWord ( string word );
 
 /**
  * Returns lowercase @word
@@ -78,7 +96,7 @@ bool isStopWord ( string & word );
  * @param word
  * @return string
  */
-string toLower ( string & word );
+string toLower ( string word );
 
 /**
  * Returns stemmed @word
@@ -86,7 +104,7 @@ string toLower ( string & word );
  * @param word
  * @return string
  */
-string stemWord ( string & word );
+string stemWord ( string word );
 
 /**
  * Returns a substring [ post, len )
@@ -96,7 +114,7 @@ string stemWord ( string & word );
  * @param len
  * @return string
  */
-string subStr ( string & word, size_t pos, size_t len );
+string subStr ( string word, size_t pos, size_t len );
 
 /**
  * Returns a substring [ begin, end )
@@ -114,7 +132,7 @@ string subStr ( string::iterator begin, string::iterator end );
  * @param chars
  * @return string
  */
-string stripStr ( string & word, vector< char > chars );
+string stripStr ( string word, vector< char > chars );
 
 /**
  * Removes all chars from word
@@ -123,7 +141,7 @@ string stripStr ( string & word, vector< char > chars );
  * @param word
  * @return string
  */
-string stripStr ( string & word );
+string stripStr ( string word );
 
 /**
  * Returns true is character is a letter
diff --git a/util/tests/tokenizerTest.cpp b/util/tests/tokenizerTest.cpp
index a89e22d..235bf67 100644
--- a/util/tests/tokenizerTest.cpp
+++ b/util/tests/tokenizerTest.cpp
@@ -42,5 +42,5 @@ void testExecute ( string original )
 		cout << std::endl;
 		}
 	delete dict;
-
+	dict = nullptr;
 	}
-- 
GitLab