added verbose

d69dc520 · vcday · f3a234af · d69dc520 · d69dc520 · d69dc520
Commit d69dc520 authored 7 years ago by vcday
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -71,10 +71,9 @@ void Spider::FuncToRun()
 				string pathToDisk = localPath + "/crawlerOutput/" + to_string(docID)+ ".txt";
 				int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
 				Document document ( currentUrl, reader->buffer );
 				auto dict = parser.execute ( &document );
+				cout << "docID: " << docID << endl;
 				for ( auto it = dict->begin( ); it != dict->end( ); it++ )
 					{
 					cout << it->first << " : ";
@@ -84,6 +83,8 @@ void Spider::FuncToRun()
 						}
 					cout << std::endl;
 					}
+				cout << std::endl;
+				delete dict;
 				cond = true;
 				}

--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -33,7 +33,7 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume
 void Parser::parse ( string html, Tokenizer *tokenizer )
 	{
 	auto htmlIt = html.begin( );
-	int offset = 0;
+	unsigned long offset = 0;
 	while ( htmlIt != html.end( ) )
 		{
 		// if open bracket
@@ -49,6 +49,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 			if ( url != "" )
 				{
 				urlFrontier->Push( url );
+				cout << url << endl;
 				}
 				// check if line is title
 			else
@@ -59,7 +60,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 					tokenizer->execute( title, offset );
 					}
 				}
-			//TODO fix offset?
 			offset = htmlIt - html.begin( );
 			}
 		else
@@ -86,7 +86,11 @@ string Parser::extract_url ( string & word )
 		if ( *foundHttp != '\0' )
 			{
 			url = "";
-			auto closeTag = findNext( ">", word.begin( ) );
+			auto closeTag = findNext( ">", foundHref );
+			if ( *closeTag != '\0' && *( closeTag - 1 ) == '\"' )
+				{
+				closeTag -= 1;
+				}
 			while ( *foundHttp != *closeTag )
 				{
 				url += *foundHttp;

--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
@@ -9,13 +9,37 @@
 using namespace std;
 void testSimple ( );
 void testComplex ( );
 int main ( )
 	{
 	cout << "Testing Parser ... " << endl << endl;
-	testSimple ();
-	testComplex ();
+	const char * line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>";
+	ProducerConsumerQueue< string > urlFrontierTest;
+	ParsedUrl url = ParsedUrl( "testurl.com" );
+	char docString[10240];
+	strcpy( docString, line );
+	Document document( url, docString );
+	Parser parser( &urlFrontierTest );
+	auto dict = parser.execute( &document );
+	for ( auto it = dict->begin( ); it != dict->end( ); it++ )
+		{
+		cout << it->first << ':';
+		for ( int i = 0; i < it->second.size( ); ++i )
+			{
+			cout << it->second[ i ] << " ";
+			}
+		cout << std::endl;
+		}
+//	testSimple( );
+//	testComplex( );
 	cout << "Parser Tests Passed! :D" << endl;
 	}
@@ -23,60 +47,61 @@ int main ( )
 void testSimple ( )
 	{
-	ProducerConsumerQueue < string >  urlFrontierTest;
+	ProducerConsumerQueue< string > urlFrontierTest;
-	ParsedUrl url = ParsedUrl("testurl.com");
+	ParsedUrl url = ParsedUrl( "testurl.com" );
 	char docString[10240];
-	strcpy(docString, "<title>This Cat Title Cat</title>");
+	strcpy( docString, "<title>This Cat Title Cat</title>" );
-	Document document ( url, docString);
+	Document document( url, docString );
-	Parser parser ( &urlFrontierTest );
+	Parser parser( &urlFrontierTest );
-	auto dictionary = parser.execute ( &document );
+	auto dictionary = parser.execute( &document );
 	assert ( dictionary != nullptr );
-	assert ( dictionary->size () == 2);
+	assert ( dictionary->size( ) == 2 );
-	assert ( dictionary->find ( "cat" ) != dictionary->end () );
+	assert ( dictionary->find( "cat" ) != dictionary->end( ) );
-	assert ( dictionary->find ( "titl" ) != dictionary->end () );
+	assert ( dictionary->find( "titl" ) != dictionary->end( ) );
-	assert ( dictionary->find ( "this" ) == dictionary->end () );
+	assert ( dictionary->find( "this" ) == dictionary->end( ) );
-	assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
+	assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 );
-	assert ( dictionary->at ( "titl" )[ 0 ] == 1 );
+	assert ( dictionary->at( "titl" )[ 0 ] == 1 );
 	delete dictionary;
 	}
 void testComplex ( )
 	{
-	ProducerConsumerQueue < string >  urlFrontierTest;
+	ProducerConsumerQueue< string > urlFrontierTest;
-	ifstream file("../tests/cats.html");
+	ifstream file( "../tests/cats.html" );
 	string temp;
 	string docString = "<title>Joe the Cat</title>\n";
 	docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
-	while ( std::getline ( file, temp ) )
+	while ( std::getline( file, temp ) )
 		{
 		docString += temp;
 		}
-	ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html");
+	ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" );
-	char * writable = new char[docString.size( ) + 1];
+	char *writable = new char[docString.size( ) + 1];
-	std::copy(docString.begin( ), docString.end( ), writable);
+	std::copy( docString.begin( ), docString.end( ), writable );
 	writable[ docString.size( ) ] = '\0';
-	Document document ( url, writable );
+	Document document( url, writable );
-	Parser parser ( &urlFrontierTest );
+	Parser parser( &urlFrontierTest );
-	auto dictionary = parser.execute ( &document );
+	auto dictionary = parser.execute( &document );
 	assert ( dictionary != nullptr );
-	assert ( dictionary->size () == 3);
+	assert ( dictionary->size( ) == 3 );
-	assert ( dictionary->find ( "cat" ) != dictionary->end () );
+	assert ( dictionary->find( "cat" ) != dictionary->end( ) );
-	assert ( dictionary->find ( "stori" ) != dictionary->end () );
+	assert ( dictionary->find( "stori" ) != dictionary->end( ) );
-	assert ( dictionary->find ( "joe" ) != dictionary->end () );
+	assert ( dictionary->find( "joe" ) != dictionary->end( ) );
-	assert ( dictionary->find ( "the" ) == dictionary->end () );
+	assert ( dictionary->find( "the" ) == dictionary->end( ) );
-	assert ( dictionary->find ( "of" ) == dictionary->end () );
+	assert ( dictionary->find( "of" ) == dictionary->end( ) );
 	delete dictionary;
 	delete[] writable;
 	}
\ No newline at end of file
--- a/tests/crawlerOutput_1.txt
+++ b/tests/crawlerOutput_1.txt
--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
@@ -27,22 +27,20 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const
 * @param originalText
 * @param offset
 */
-void Tokenizer::execute ( string & originalText, int offset )
+void Tokenizer::execute ( string & originalText, unsigned long offset )
 	{
-	vector< string > splitText = splitStr( originalText, ' ' );
+	vector< string > splitText = splitStr( originalText, ' ', true );
 	string processedString = "";
 	for ( int i = 0; i < splitText.size( ); ++i )
 		{
 		// case fold
 		processedString = toLower( splitText[ i ] );
 		//strip all characters
-		processedString = stripStr( processedString );
 		if ( !isStopWord( processedString ) )
 			{
 			// stem word
-			//FIXME
+			processedString = stem.execute( processedString );
-//			processedString = stem.execute( processedString );
 			( *docIndex )[ processedString ].push_back( offset );
 			++offset;
 			}

--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -33,7 +33,7 @@ public:
 	 * @param originalText
 	 * @param offset
 	 */
-	void execute ( string &originalText, int offset );
+	void execute ( string &originalText, unsigned long offset );
 	private:
 		unordered_map< string, vector< int>> *docIndex;

--- a/util/stringProcessing.cpp
+++ b/util/stringProcessing.cpp
@@ -174,12 +174,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
 /**
 * Returns a vector of strings from @originalText, split by @delim
+ * Will remove symbols (any non-alphanumeric characters) if @removeSyms is set
 *
 * @param originalText
 * @param delim
+ * @param removeSyms
 * @return vector < string >
 */
-vector< string > splitStr ( string & originalText, char delim )
+vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
 	{
 	vector< string > splitWords;
 	auto begin = originalText.begin( );
@@ -189,7 +191,10 @@ vector< string > splitStr ( string & originalText, char delim )
 		string word = "";
 		while ( *begin != delim && *begin != '\0' )
 			{
-			word += *begin;
+			if (removeSyms && ( isAlpha( *begin ) || isNum( *begin ) ) )
+				{
+				word += *begin;
+				}
 			++begin;
 			}

--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -55,12 +55,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
 /**
 * Returns a vector of strings from @originalText, split by @delim
+ * Will remove symbols (any non-alphanumeric characters) if @removeSyms is set
 *
 * @param originalText
 * @param delim
+ * @param removeSyms
 * @return vector< string >
 */
-vector< string > splitStr ( string & originalText, char delim );
+vector< string > splitStr ( string & originalText, char delim, bool removeSyms );
 /**
 * Returns true if @word is a stopword

--- a/util/tests/stringProcessingTest.cpp
+++ b/util/tests/stringProcessingTest.cpp
@@ -136,11 +136,11 @@ void testSplitStr ( string original )
 	{
 	cout << "Testing splitStr..." << endl;
-	vector< string > vec = splitStr( original, ' ' );
+	vector< string > vec = splitStr( original, ' ', true);
 	assert( vec.size( ) == 53 );
 	string word = "hello\ngoodbye";
-	vec = splitStr( word, '\n' );
+	vec = splitStr( word, '\n', true );
 	assert( vec.size( ) == 2 );
 	assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );