Skip to content
Snippets Groups Projects
Commit d69dc520 authored by vcday's avatar vcday
Browse files

added verbose

parent f3a234af
No related branches found
No related tags found
No related merge requests found
......@@ -71,10 +71,9 @@ void Spider::FuncToRun()
string pathToDisk = localPath + "/crawlerOutput/" + to_string(docID)+ ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
Document document ( currentUrl, reader->buffer );
auto dict = parser.execute ( &document );
cout << "docID: " << docID << endl;
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
cout << it->first << " : ";
......@@ -84,6 +83,8 @@ void Spider::FuncToRun()
}
cout << std::endl;
}
cout << std::endl;
delete dict;
cond = true;
}
......
......@@ -33,7 +33,7 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume
void Parser::parse ( string html, Tokenizer *tokenizer )
{
auto htmlIt = html.begin( );
int offset = 0;
unsigned long offset = 0;
while ( htmlIt != html.end( ) )
{
// if open bracket
......@@ -49,6 +49,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
if ( url != "" )
{
urlFrontier->Push( url );
cout << url << endl;
}
// check if line is title
else
......@@ -59,7 +60,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
tokenizer->execute( title, offset );
}
}
//TODO fix offset?
offset = htmlIt - html.begin( );
}
else
......@@ -86,7 +86,11 @@ string Parser::extract_url ( string & word )
if ( *foundHttp != '\0' )
{
url = "";
auto closeTag = findNext( ">", word.begin( ) );
auto closeTag = findNext( ">", foundHref );
if ( *closeTag != '\0' && *( closeTag - 1 ) == '\"' )
{
closeTag -= 1;
}
while ( *foundHttp != *closeTag )
{
url += *foundHttp;
......
......@@ -9,13 +9,37 @@
using namespace std;
void testSimple ( );
void testComplex ( );
int main ( )
{
cout << "Testing Parser ... " << endl << endl;
testSimple ();
testComplex ();
const char * line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>";
ProducerConsumerQueue< string > urlFrontierTest;
ParsedUrl url = ParsedUrl( "testurl.com" );
char docString[10240];
strcpy( docString, line );
Document document( url, docString );
Parser parser( &urlFrontierTest );
auto dict = parser.execute( &document );
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
// testSimple( );
// testComplex( );
cout << "Parser Tests Passed! :D" << endl;
}
......@@ -23,60 +47,61 @@ int main ( )
void testSimple ( )
{
ProducerConsumerQueue < string > urlFrontierTest;
ParsedUrl url = ParsedUrl("testurl.com");
ProducerConsumerQueue< string > urlFrontierTest;
ParsedUrl url = ParsedUrl( "testurl.com" );
char docString[10240];
strcpy(docString, "<title>This Cat Title Cat</title>");
Document document ( url, docString);
strcpy( docString, "<title>This Cat Title Cat</title>" );
Document document( url, docString );
Parser parser ( &urlFrontierTest );
auto dictionary = parser.execute ( &document );
Parser parser( &urlFrontierTest );
auto dictionary = parser.execute( &document );
assert ( dictionary != nullptr );
assert ( dictionary->size () == 2);
assert ( dictionary->find ( "cat" ) != dictionary->end () );
assert ( dictionary->find ( "titl" ) != dictionary->end () );
assert ( dictionary->find ( "this" ) == dictionary->end () );
assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
assert ( dictionary->at ( "titl" )[ 0 ] == 1 );
assert ( dictionary->size( ) == 2 );
assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find( "titl" ) != dictionary->end( ) );
assert ( dictionary->find( "this" ) == dictionary->end( ) );
assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 );
assert ( dictionary->at( "titl" )[ 0 ] == 1 );
delete dictionary;
}
void testComplex ( )
{
ProducerConsumerQueue < string > urlFrontierTest;
ifstream file("../tests/cats.html");
ProducerConsumerQueue< string > urlFrontierTest;
ifstream file( "../tests/cats.html" );
string temp;
string docString = "<title>Joe the Cat</title>\n";
docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
while ( std::getline ( file, temp ) )
while ( std::getline( file, temp ) )
{
docString += temp;
}
ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html");
char * writable = new char[docString.size( ) + 1];
std::copy(docString.begin( ), docString.end( ), writable);
ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" );
char *writable = new char[docString.size( ) + 1];
std::copy( docString.begin( ), docString.end( ), writable );
writable[ docString.size( ) ] = '\0';
Document document ( url, writable );
Document document( url, writable );
Parser parser ( &urlFrontierTest );
auto dictionary = parser.execute ( &document );
Parser parser( &urlFrontierTest );
auto dictionary = parser.execute( &document );
assert ( dictionary != nullptr );
assert ( dictionary->size () == 3);
assert ( dictionary->size( ) == 3 );
assert ( dictionary->find ( "cat" ) != dictionary->end () );
assert ( dictionary->find ( "stori" ) != dictionary->end () );
assert ( dictionary->find ( "joe" ) != dictionary->end () );
assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find( "stori" ) != dictionary->end( ) );
assert ( dictionary->find( "joe" ) != dictionary->end( ) );
assert ( dictionary->find ( "the" ) == dictionary->end () );
assert ( dictionary->find ( "of" ) == dictionary->end () );
assert ( dictionary->find( "the" ) == dictionary->end( ) );
assert ( dictionary->find( "of" ) == dictionary->end( ) );
delete dictionary;
delete[] writable;
}
\ No newline at end of file
}
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
......@@ -27,22 +27,20 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const
* @param originalText
* @param offset
*/
void Tokenizer::execute ( string & originalText, int offset )
void Tokenizer::execute ( string & originalText, unsigned long offset )
{
vector< string > splitText = splitStr( originalText, ' ' );
vector< string > splitText = splitStr( originalText, ' ', true );
string processedString = "";
for ( int i = 0; i < splitText.size( ); ++i )
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
processedString = stripStr( processedString );
if ( !isStopWord( processedString ) )
{
// stem word
//FIXME
// processedString = stem.execute( processedString );
processedString = stem.execute( processedString );
( *docIndex )[ processedString ].push_back( offset );
++offset;
}
......
......@@ -33,7 +33,7 @@ public:
* @param originalText
* @param offset
*/
void execute ( string &originalText, int offset );
void execute ( string &originalText, unsigned long offset );
private:
unordered_map< string, vector< int>> *docIndex;
......
......@@ -174,12 +174,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
/**
* Returns a vector of strings from @originalText, split by @delim.
* When @removeSyms is true, only characters accepted by isAlpha or isNum are kept in each word.
*
* @param originalText
* @param delim
* @param removeSyms
* @return vector < string >
*/
vector< string > splitStr ( string & originalText, char delim )
vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
{
vector< string > splitWords;
auto begin = originalText.begin( );
......@@ -189,7 +191,10 @@ vector< string > splitStr ( string & originalText, char delim )
string word = "";
while ( *begin != delim && *begin != '\0' )
{
word += *begin;
if (removeSyms && ( isAlpha( *begin ) || isNum( *begin ) ) )
{
word += *begin;
}
++begin;
}
......
......@@ -55,12 +55,14 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
/**
* Returns a vector of strings from @originalText, split by @delim
* When @removeSyms is true, only characters accepted by isAlpha or isNum are kept in each word.
*
* @param originalText
* @param delim
* @param removeSyms
* @return vector< string >
*/
vector< string > splitStr ( string & originalText, char delim );
vector< string > splitStr ( string & originalText, char delim, bool removeSyms );
/**
* Returns true if @word is a stopword
......
......@@ -136,11 +136,11 @@ void testSplitStr ( string original )
{
cout << "Testing splitStr..." << endl;
vector< string > vec = splitStr( original, ' ' );
vector< string > vec = splitStr( original, ' ', true);
assert( vec.size( ) == 53 );
string word = "hello\ngoodbye";
vec = splitStr( word, '\n' );
vec = splitStr( word, '\n', true );
assert( vec.size( ) == 2 );
assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment