Skip to content
Snippets Groups Projects
Commit d69dc520 authored by vcday's avatar vcday
Browse files

added verbose

parent f3a234af
No related branches found
No related tags found
No related merge requests found
...@@ -71,10 +71,9 @@ void Spider::FuncToRun() ...@@ -71,10 +71,9 @@ void Spider::FuncToRun()
string pathToDisk = localPath + "/crawlerOutput/" + to_string(docID)+ ".txt"; string pathToDisk = localPath + "/crawlerOutput/" + to_string(docID)+ ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk); int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
Document document ( currentUrl, reader->buffer ); Document document ( currentUrl, reader->buffer );
auto dict = parser.execute ( &document ); auto dict = parser.execute ( &document );
cout << "docID: " << docID << endl;
for ( auto it = dict->begin( ); it != dict->end( ); it++ ) for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{ {
cout << it->first << " : "; cout << it->first << " : ";
...@@ -84,6 +83,8 @@ void Spider::FuncToRun() ...@@ -84,6 +83,8 @@ void Spider::FuncToRun()
} }
cout << std::endl; cout << std::endl;
} }
cout << std::endl;
delete dict;
cond = true; cond = true;
} }
......
...@@ -33,7 +33,7 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume ...@@ -33,7 +33,7 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume
void Parser::parse ( string html, Tokenizer *tokenizer ) void Parser::parse ( string html, Tokenizer *tokenizer )
{ {
auto htmlIt = html.begin( ); auto htmlIt = html.begin( );
int offset = 0; unsigned long offset = 0;
while ( htmlIt != html.end( ) ) while ( htmlIt != html.end( ) )
{ {
// if open bracket // if open bracket
...@@ -49,6 +49,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) ...@@ -49,6 +49,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
if ( url != "" ) if ( url != "" )
{ {
urlFrontier->Push( url ); urlFrontier->Push( url );
cout << url << endl;
} }
// check if line is title // check if line is title
else else
...@@ -59,7 +60,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) ...@@ -59,7 +60,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
tokenizer->execute( title, offset ); tokenizer->execute( title, offset );
} }
} }
//TODO fix offset?
offset = htmlIt - html.begin( ); offset = htmlIt - html.begin( );
} }
else else
...@@ -86,7 +86,11 @@ string Parser::extract_url ( string & word ) ...@@ -86,7 +86,11 @@ string Parser::extract_url ( string & word )
if ( *foundHttp != '\0' ) if ( *foundHttp != '\0' )
{ {
url = ""; url = "";
auto closeTag = findNext( ">", word.begin( ) ); auto closeTag = findNext( ">", foundHref );
if ( *closeTag != '\0' && *( closeTag - 1 ) == '\"' )
{
closeTag -= 1;
}
while ( *foundHttp != *closeTag ) while ( *foundHttp != *closeTag )
{ {
url += *foundHttp; url += *foundHttp;
......
...@@ -9,13 +9,37 @@ ...@@ -9,13 +9,37 @@
using namespace std; using namespace std;
void testSimple ( ); void testSimple ( );
void testComplex ( ); void testComplex ( );
int main ( ) int main ( )
{ {
cout << "Testing Parser ... " << endl << endl; cout << "Testing Parser ... " << endl << endl;
testSimple ();
testComplex (); const char * line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>";
ProducerConsumerQueue< string > urlFrontierTest;
ParsedUrl url = ParsedUrl( "testurl.com" );
char docString[10240];
strcpy( docString, line );
Document document( url, docString );
Parser parser( &urlFrontierTest );
auto dict = parser.execute( &document );
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
// testSimple( );
// testComplex( );
cout << "Parser Tests Passed! :D" << endl; cout << "Parser Tests Passed! :D" << endl;
} }
...@@ -23,60 +47,61 @@ int main ( ) ...@@ -23,60 +47,61 @@ int main ( )
void testSimple ( ) void testSimple ( )
{ {
ProducerConsumerQueue < string > urlFrontierTest; ProducerConsumerQueue< string > urlFrontierTest;
ParsedUrl url = ParsedUrl("testurl.com"); ParsedUrl url = ParsedUrl( "testurl.com" );
char docString[10240]; char docString[10240];
strcpy(docString, "<title>This Cat Title Cat</title>"); strcpy( docString, "<title>This Cat Title Cat</title>" );
Document document ( url, docString); Document document( url, docString );
Parser parser ( &urlFrontierTest ); Parser parser( &urlFrontierTest );
auto dictionary = parser.execute ( &document ); auto dictionary = parser.execute( &document );
assert ( dictionary != nullptr ); assert ( dictionary != nullptr );
assert ( dictionary->size () == 2); assert ( dictionary->size( ) == 2 );
assert ( dictionary->find ( "cat" ) != dictionary->end () ); assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find ( "titl" ) != dictionary->end () ); assert ( dictionary->find( "titl" ) != dictionary->end( ) );
assert ( dictionary->find ( "this" ) == dictionary->end () ); assert ( dictionary->find( "this" ) == dictionary->end( ) );
assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 ); assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 );
assert ( dictionary->at ( "titl" )[ 0 ] == 1 ); assert ( dictionary->at( "titl" )[ 0 ] == 1 );
delete dictionary; delete dictionary;
} }
void testComplex ( ) void testComplex ( )
{ {
ProducerConsumerQueue < string > urlFrontierTest; ProducerConsumerQueue< string > urlFrontierTest;
ifstream file("../tests/cats.html"); ifstream file( "../tests/cats.html" );
string temp; string temp;
string docString = "<title>Joe the Cat</title>\n"; string docString = "<title>Joe the Cat</title>\n";
docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n"; docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
while ( std::getline ( file, temp ) ) while ( std::getline( file, temp ) )
{ {
docString += temp; docString += temp;
} }
ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html"); ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" );
char * writable = new char[docString.size( ) + 1]; char *writable = new char[docString.size( ) + 1];
std::copy(docString.begin( ), docString.end( ), writable); std::copy( docString.begin( ), docString.end( ), writable );
writable[ docString.size( ) ] = '\0'; writable[ docString.size( ) ] = '\0';
Document document ( url, writable ); Document document( url, writable );
Parser parser ( &urlFrontierTest ); Parser parser( &urlFrontierTest );
auto dictionary = parser.execute ( &document ); auto dictionary = parser.execute( &document );
assert ( dictionary != nullptr ); assert ( dictionary != nullptr );
assert ( dictionary->size () == 3); assert ( dictionary->size( ) == 3 );
assert ( dictionary->find ( "cat" ) != dictionary->end () ); assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find ( "stori" ) != dictionary->end () ); assert ( dictionary->find( "stori" ) != dictionary->end( ) );
assert ( dictionary->find ( "joe" ) != dictionary->end () ); assert ( dictionary->find( "joe" ) != dictionary->end( ) );
assert ( dictionary->find ( "the" ) == dictionary->end () ); assert ( dictionary->find( "the" ) == dictionary->end( ) );
assert ( dictionary->find ( "of" ) == dictionary->end () ); assert ( dictionary->find( "of" ) == dictionary->end( ) );
delete dictionary; delete dictionary;
delete[] writable; delete[] writable;
} }
\ No newline at end of file
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
...@@ -27,22 +27,20 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const ...@@ -27,22 +27,20 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const
* @param originalText * @param originalText
* @param offset * @param offset
*/ */
void Tokenizer::execute ( string & originalText, int offset ) void Tokenizer::execute ( string & originalText, unsigned long offset )
{ {
vector< string > splitText = splitStr( originalText, ' ' ); vector< string > splitText = splitStr( originalText, ' ', true );
string processedString = ""; string processedString = "";
for ( int i = 0; i < splitText.size( ); ++i ) for ( int i = 0; i < splitText.size( ); ++i )
{ {
// case fold // case fold
processedString = toLower( splitText[ i ] ); processedString = toLower( splitText[ i ] );
//strip all characters //strip all characters
processedString = stripStr( processedString );
if ( !isStopWord( processedString ) ) if ( !isStopWord( processedString ) )
{ {
// stem word // stem word
//FIXME processedString = stem.execute( processedString );
// processedString = stem.execute( processedString );
( *docIndex )[ processedString ].push_back( offset ); ( *docIndex )[ processedString ].push_back( offset );
++offset; ++offset;
} }
......
...@@ -33,7 +33,7 @@ public: ...@@ -33,7 +33,7 @@ public:
* @param originalText * @param originalText
* @param offset * @param offset
*/ */
void execute ( string &originalText, int offset ); void execute ( string &originalText, unsigned long offset );
private: private:
unordered_map< string, vector< int>> *docIndex; unordered_map< string, vector< int>> *docIndex;
......
...@@ -174,12 +174,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str ...@@ -174,12 +174,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
/** /**
* Returns a vector of strings from @originalText, split by @delim * Returns a vector of strings from @originalText, split by @delim
* Will remove symbols if bool is set
* *
* @param originalText * @param originalText
* @param delim * @param delim
* @param removeSyms
* @return vector < string > * @return vector < string >
*/ */
vector< string > splitStr ( string & originalText, char delim ) vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
{ {
vector< string > splitWords; vector< string > splitWords;
auto begin = originalText.begin( ); auto begin = originalText.begin( );
...@@ -189,7 +191,10 @@ vector< string > splitStr ( string & originalText, char delim ) ...@@ -189,7 +191,10 @@ vector< string > splitStr ( string & originalText, char delim )
string word = ""; string word = "";
while ( *begin != delim && *begin != '\0' ) while ( *begin != delim && *begin != '\0' )
{ {
word += *begin; if (removeSyms && ( isAlpha( *begin ) || isNum( *begin ) ) )
{
word += *begin;
}
++begin; ++begin;
} }
......
...@@ -55,12 +55,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str ...@@ -55,12 +55,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
/** /**
* Returns a vector of strings from @originalText, split by @delim * Returns a vector of strings from @originalText, split by @delim
* Will remove symbols if bool is set
* *
* @param originalText * @param originalText
* @param delim * @param delim
* @param removeSyms
* @return vector< string > * @return vector< string >
*/ */
vector< string > splitStr ( string & originalText, char delim ); vector< string > splitStr ( string & originalText, char delim, bool removeSyms );
/** /**
* Returns true if @word is a stopword * Returns true if @word is a stopword
......
...@@ -136,11 +136,11 @@ void testSplitStr ( string original ) ...@@ -136,11 +136,11 @@ void testSplitStr ( string original )
{ {
cout << "Testing splitStr..." << endl; cout << "Testing splitStr..." << endl;
vector< string > vec = splitStr( original, ' ' ); vector< string > vec = splitStr( original, ' ', true);
assert( vec.size( ) == 53 ); assert( vec.size( ) == 53 );
string word = "hello\ngoodbye"; string word = "hello\ngoodbye";
vec = splitStr( word, '\n' ); vec = splitStr( word, '\n', true );
assert( vec.size( ) == 2 ); assert( vec.size( ) == 2 );
assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" ); assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment