Skip to content
Snippets Groups Projects
Commit 2159787c authored by vcday's avatar vcday
Browse files

fixed weird parsing errors

parent f2b25037
No related branches found
No related tags found
No related merge requests found
......@@ -73,6 +73,7 @@ void Spider::FuncToRun()
Document document ( currentUrl, reader->buffer );
auto dict = parser.execute ( &document );
cout << "docID: " << docID << endl;
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
......@@ -85,7 +86,7 @@ void Spider::FuncToRun()
}
cout << std::endl;
delete dict;
dict = nullptr;
cond = true;
}
else
......
......@@ -16,10 +16,10 @@ Parser::Parser ( ProducerConsumerQueue< string > *urlFrontierIn )
* Executes the Parser
* @return
*/
const unordered_map< string, vector< int > > *Parser::execute ( Document *document )
const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document )
{
Tokenizer tokenizer;
parse( document->DocToString( ), &tokenizer );
parse( document->DocToString( ), document->getUrl( ), &tokenizer );
return tokenizer.get( );
}
......@@ -28,12 +28,21 @@ const unordered_map< string, vector< int > > *Parser::execute ( Document *docume
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void Parser::parse ( string html, Tokenizer *tokenizer )
void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
{
auto htmlIt = html.begin( );
unsigned long offset = 0;
// tokenize url
string host = "";
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string url = host + "/" + path;
tokenizer->execute( url, offset, Tokenizer::URL );
while ( htmlIt != html.end( ) )
{
// if open bracket
......@@ -48,7 +57,14 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
string url = extract_url( line );
if ( url != "" )
{
if ( isLocal ( url ) )
{
string completeUrl = "";
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
urlFrontier->Push( url );
cout << url << endl;
}
// check if line is title
else
......@@ -56,7 +72,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
string title = extract_title( line );
if ( title != "" )
{
tokenizer->execute( title, offset );
tokenizer->execute( title, offset, Tokenizer::TITLE );
}
}
offset = htmlIt - html.begin( );
......@@ -123,3 +139,13 @@ string Parser::extract_title ( string & word )
return title;
}
/**
 * Returns true if @url is a relative (site-local) link,
 * i.e. it begins with a '/' and must be prefixed with the
 * current document's host before being pushed to the frontier.
 *
 * @param url candidate link extracted from an href
 * @return true when the url is local, false otherwise
 */
bool Parser::isLocal ( string url )
{
	// guard: *url.begin( ) on an empty string is undefined behavior
	return !url.empty( ) && url.front( ) == '/';
}
\ No newline at end of file
......@@ -33,7 +33,7 @@ public:
* Executes the Parser
* @return
*/
const unordered_map< string, vector< int> > *execute ( Document *document );
const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
private:
......@@ -44,9 +44,7 @@ private:
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void parse ( string html, Tokenizer *tokenizer );
void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
/**
......@@ -64,6 +62,13 @@ private:
*/
string extract_title ( string & word );
/**
* Will return true if local url
*
* @param url
* @return
*/
bool isLocal ( string url );
};
......@@ -17,9 +17,15 @@ void testURL ( );
// Parser test driver: runs the URL, simple, and complex cases in order.
int main ( )
{
	cout << "Testing Parser ... " << endl << endl
	     << "Testing URL: " << endl;
	testURL ( );
	cout << "URL Test Passed!" << endl << endl
	     << "Testing Simple: " << endl;
	testSimple( );
	cout << "Simple Test Passed!" << endl << endl
	     << "Testing Complex: " << endl;
	testComplex( );
	cout << "Complex Test Passed!" << endl
	     << "Parser Tests Passed! :D" << endl;
}
......@@ -35,17 +41,26 @@ void testSimple ( )
Parser parser( &urlFrontierTest );
auto dictionary = parser.execute( &document );
for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
assert ( dictionary != nullptr );
assert ( dictionary->size( ) == 2 );
assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find( "titl" ) != dictionary->end( ) );
assert ( dictionary->find( "this" ) == dictionary->end( ) );
assert ( dictionary->at( "cat" )[ 0 ] == 0 && dictionary->at( "cat" )[ 1 ] == 2 );
assert ( dictionary->at( "titl" )[ 0 ] == 1 );
assert ( dictionary->size( ) == 4 );
assert ( dictionary->find( "#cat" ) != dictionary->end( ) );
assert ( dictionary->find( "$testurl" ) != dictionary->end( ) );
assert ( dictionary->find( "#titl" ) != dictionary->end( ) );
assert ( dictionary->find( "#this" ) == dictionary->end( ) );
assert ( dictionary->at( "#cat" )[ 0 ] == 0 && dictionary->at( "#cat" )[ 1 ] == 2 );
assert ( dictionary->at( "#titl" )[ 0 ] == 1 );
delete dictionary;
dictionary = nullptr;
}
void testComplex ( )
......@@ -60,7 +75,6 @@ void testComplex ( )
{
docString += temp;
}
ParsedUrl url = ParsedUrl( "https://www.w3schools.com/tests/cats.html" );
char *writable = new char[docString.size( ) + 1];
std::copy( docString.begin( ), docString.end( ), writable );
......@@ -70,19 +84,32 @@ void testComplex ( )
Parser parser( &urlFrontierTest );
auto dictionary = parser.execute( &document );
for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
assert ( dictionary != nullptr );
assert ( dictionary->size( ) == 3 );
assert ( dictionary->size( ) == 8 );
assert ( dictionary->find( "cat" ) != dictionary->end( ) );
assert ( dictionary->find( "stori" ) != dictionary->end( ) );
assert ( dictionary->find( "joe" ) != dictionary->end( ) );
assert ( dictionary->find( "#cat" ) != dictionary->end( ) );
assert ( dictionary->find( "#stori" ) != dictionary->end( ) );
assert ( dictionary->find( "#joe" ) != dictionary->end( ) );
assert ( dictionary->find( "$w3school" ) != dictionary->end( ) );
assert ( dictionary->find( "$test" ) != dictionary->end( ) );
assert ( dictionary->find( "$cat" ) != dictionary->end( ) );
assert ( dictionary->find( "the" ) == dictionary->end( ) );
assert ( dictionary->find( "of" ) == dictionary->end( ) );
assert ( dictionary->find( "#the" ) == dictionary->end( ) );
assert ( dictionary->find( "#of" ) == dictionary->end( ) );
delete dictionary;
dictionary = nullptr;
delete[] writable;
writable = nullptr;
}
......@@ -91,13 +118,27 @@ void testURL ( )
const char *line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>";
ProducerConsumerQueue< string > urlFrontierTest;
ParsedUrl url = ParsedUrl( "testurl.com" );
ParsedUrl url = ParsedUrl( "http://testurl.com" );
char docString[10240];
strcpy( docString, line );
Document document( url, docString );
Parser parser( &urlFrontierTest );
auto dict = parser.execute( &document );
auto dictionary = parser.execute( &document );
for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
assert ( urlFrontierTest.Pop( ) == "http://www.bafta.org/");
delete dict;
assert ( dictionary->find( "$bafta" ) == dictionary->end( ) );
assert ( dictionary->find( "$testurl" ) != dictionary->end( ) );
delete dictionary;
dictionary = nullptr;
}
\ No newline at end of file
......@@ -59,6 +59,12 @@ int Document::WriteToDocMap ( )
}
/**
 * Returns a copy of the url this document was fetched from.
 *
 * @return ParsedUrl of this document
 */
ParsedUrl Document::getUrl ( )
{
	return url;
}
void Document::PrintDocMap ( string url, int location )
{
pthread_mutex_lock ( &docMap_mutex );
......
......@@ -44,6 +44,7 @@ public:
int WriteToDocMap();
ParsedUrl getUrl ( );
static void PrintDocMap( string url, int location );
};
\ No newline at end of file
......@@ -47,7 +47,7 @@ public:
pathBuffer = new char[ strlen( url ) + 1 ];
char *f, *t;
for ( t = pathBuffer, f = url; *t++ = *f++; )
for ( t = pathBuffer, f = url; ( *t++ = *f++ ); )
;
Service = pathBuffer;
......
#include "Tokenizer.h"
#include <iostream>
/**
* Tokenizer Cstor
*/
Tokenizer::Tokenizer ( )
{
docIndex = new unordered_map< string, vector< int>>;
docIndex = new unordered_map< string, vector< unsigned long > >;
}
/**
......@@ -14,7 +15,7 @@ Tokenizer::Tokenizer ( )
*
* @return pointer to unordered_map< string, vector< int>>
*/
unordered_map< string, vector< int>> *Tokenizer::get ( ) const
unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
{
return docIndex;
}
......@@ -23,13 +24,36 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const
* Executes the Tokenizer
* Sends tokens to dictionary
*
* token -> [offsets]
* @param originalText
* @param offset
* @param decorator
*/
void Tokenizer::execute ( string & originalText, unsigned long offset )
void Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
{
// split by symbols
if ( decorator == Tokenizer::URL )
{
vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '='};
tokenize( splitStr( originalText, split, true ), offset, decorator );
}
// split by spaces
else
{
tokenize( splitStr( originalText, ' ', true ), offset, decorator );
}
}
/**
* Tokenizes text (titles, body text)
*
* @param originalText
* @param offset
* @param decorator
*/
void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
{
vector< string > splitText = splitStr( originalText, ' ', true );
string processedString = "";
for ( int i = 0; i < splitText.size( ); ++i )
{
......@@ -41,8 +65,13 @@ void Tokenizer::execute ( string & originalText, unsigned long offset )
{
// stem word
processedString = stem.execute( processedString );
( *docIndex )[ processedString ].push_back( offset );
++offset;
if ( decorator != '\0' )
{
processedString = decorator + processedString;
}
( *docIndex )[ processedString ].push_back( offset );
++offset;
}
}
}
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
......@@ -13,6 +14,11 @@ class Tokenizer
public:
// decorators
static const char TITLE = '#';
static const char ANCHOR = '@';
static const char URL = '$';
/**
* Tokenizer Cstor
*/
......@@ -23,19 +29,31 @@ public:
*
* @return pointer to unordered_map< string, vector< int>>
*/
unordered_map< string, vector< int>> *get ( ) const;
unordered_map< string, vector< unsigned long > > *get ( ) const;
/**
* Executes the Tokenizer
* Sends tokens to dictionary
*
* token -> [offsets]
*
* @param originalText
* @param offset
* @param decorator
*/
void execute ( string originalText, unsigned long offset, char decorator = '\0' );
private:
unordered_map< string, vector< unsigned long > > *docIndex;
Stemmer stem;
/**
* Tokenizes text (titles, body text)
*
* @param originalText
* @param offset
* @param decorator
*/
void execute ( string &originalText, unsigned long offset );
void tokenize ( vector< string > splitText, unsigned long offset, char decorator );
private:
unordered_map< string, vector< int>> *docIndex;
Stemmer stem;
};
......@@ -5,7 +5,7 @@
#include "stringProcessing.h"
#include "Stemmer.h"
#include <cassert>
#include <iostream>
using namespace std;
/**
......@@ -181,7 +181,7 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
* @param removeChars
* @return vector < string >
*/
vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
vector< string > splitStr ( string originalText, char delim , bool removeSyms)
{
vector< string > splitWords;
auto begin = originalText.begin( );
......@@ -209,13 +209,72 @@ vector< string > splitStr ( string & originalText, char delim , bool removeSyms)
}
/**
 * Splits string by multiple delimiters
 *
 * @param originalText text to split
 * @param delims characters that terminate a word
 * @param removeSyms when true, keep only alphanumeric characters inside words
 * @return vector < string > of the non-empty words between delimiters
 */
vector< string > splitStr ( string originalText, vector < char > delims , bool removeSyms)
	{
	vector< string > splitWords;
	string word = "";

	// Single linear scan. The previous two-loop version decremented the index
	// when it saw a delimiter the inner loop had not consumed, which re-read
	// the same character forever on a leading delimiter or on two adjacent
	// delimiters (e.g. the "//" in every absolute url). It also appended
	// characters only when removeSyms was true, so removeSyms == false
	// always produced an empty result.
	for ( size_t i = 0; i < originalText.size( ); ++i )
		{
		char curr = originalText[ i ];
		if ( inArray( curr, delims ) )
			{
			// delimiter closes the current word
			if ( word != "" && word != " " )
				{
				splitWords.push_back( word );
				}
			word = "";
			}
		else if ( !removeSyms || isAlpha( curr ) || isNum( curr ) )
			{
			word += curr;
			}
		}

	// flush a trailing word not followed by a delimiter
	if ( word != "" && word != " " )
		{
		splitWords.push_back( word );
		}

	return splitWords;
	}
/**
 * Returns true if element is in array, false otherwise
 *
 * @param needle element to look for
 * @param haystack vector to search
 * @return true when needle occurs in haystack
 */
template <typename T> bool inArray ( T needle, vector < T > haystack )
	{
	for ( const T &candidate : haystack )
		{
		if ( candidate == needle )
			{
			return true;
			}
		}
	return false;
	}
/**
* Returns true if @word is a stopword
*
* @param word
* @return bool
*/
bool isStopWord ( string & word )
bool isStopWord ( string word )
{
return ( stopWords.find( word ) != stopWords.end( ) );
......@@ -227,7 +286,7 @@ bool isStopWord ( string & word )
* @param word
* @return string
*/
string toLower ( string & word )
string toLower ( string word )
{
auto iter = word.begin( );
string lowerWord = "";
......@@ -254,7 +313,7 @@ string toLower ( string & word )
* @param word
* @return string
*/
string stemWord ( string & word )
string stemWord ( string word )
{
Stemmer stemmer;
word = stemmer.execute( word );
......@@ -269,7 +328,7 @@ string stemWord ( string & word )
* @param len
* @return string
*/
string subStr ( string & word, size_t pos, size_t len )
string subStr ( string word, size_t pos, size_t len )
{
string substr = "";
for ( int i = 0; i < len; ++i )
......@@ -305,7 +364,7 @@ string subStr ( string::iterator begin, string::iterator end )
* @param chars
* @return string
*/
string stripStr ( string & word, vector< char > chars )
string stripStr ( string word, vector< char > chars )
{
string wordStripped = "";
auto begin = word.begin( );
......@@ -337,7 +396,7 @@ string stripStr ( string & word, vector< char > chars )
* @param chars
* @return string
*/
string stripStr ( string & word )
string stripStr ( string word )
{
string wordStripped = "";
auto begin = word.begin( );
......
......@@ -21,7 +21,7 @@ static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as",
"she",
"some", "the", "their", "them", "there", "they", "that",
"this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will",
"with",
"with", "www",
"you", "your" };
/**
......@@ -62,7 +62,25 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer, str
* @param removeSyms
* @return vector< string >
*/
vector< string > splitStr ( string & originalText, char delim, bool removeSyms );
vector< string > splitStr ( string originalText, char delim, bool removeSyms );
/**
* Splits string by multiple delimiters
*
* @param originalText
* @param delims
* @param removeSyms
* @return
*/
vector< string > splitStr ( string originalText, vector < char > delims, bool removeSyms );
/**
* Returns true if element is in array, false otherwise
*
* @param vec
* @return
*/
template <typename T> bool inArray ( T needle, vector < T > haystack );
/**
* Returns true if @word is a stopword
......@@ -70,7 +88,7 @@ vector< string > splitStr ( string & originalText, char delim, bool removeSyms )
* @param word
* @return bool
*/
bool isStopWord ( string & word );
bool isStopWord ( string word );
/**
* Returns lowercase @word
......@@ -78,7 +96,7 @@ bool isStopWord ( string & word );
* @param word
* @return string
*/
string toLower ( string & word );
string toLower ( string word );
/**
* Returns stemmed @word
......@@ -86,7 +104,7 @@ string toLower ( string & word );
* @param word
* @return string
*/
string stemWord ( string & word );
string stemWord ( string word );
/**
* Returns a substring [ post, len )
......@@ -96,7 +114,7 @@ string stemWord ( string & word );
* @param len
* @return string
*/
string subStr ( string & word, size_t pos, size_t len );
string subStr ( string word, size_t pos, size_t len );
/**
* Returns a substring [ begin, end )
......@@ -114,7 +132,7 @@ string subStr ( string::iterator begin, string::iterator end );
* @param chars
* @return string
*/
string stripStr ( string & word, vector< char > chars );
string stripStr ( string word, vector< char > chars );
/**
* Removes all chars from word
......@@ -123,7 +141,7 @@ string stripStr ( string & word, vector< char > chars );
* @param word
* @return string
*/
string stripStr ( string & word );
string stripStr ( string word );
/**
* Returns true is character is a letter
......
......@@ -42,5 +42,5 @@ void testExecute ( string original )
cout << std::endl;
}
delete dict;
dict = nullptr;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment