Skip to content
Snippets Groups Projects
Commit acee5aa5 authored by aanvi's avatar aanvi
Browse files

Added functionalities

parent a33c0ba5
No related branches found
No related tags found
1 merge request: !2 WIP: Crawler parser 2 — merge into duplicate url-crawler
......@@ -7,12 +7,15 @@
* @param inFile
* @return
*/
// TODO: instead of grabbing each line, look for the beginning of a
//       title/url/anchor-text tag, etc. Then continue until the close tag and add to the tokenizer after the end tag is found.
// TODO: have to read input in as a stream of chars eventually - cat into string?
// TODO: different counts: frequency, total number of unique words, etc.
// TODO: handle bad HTML style (i.e. no closing p tag)
// TODO: flag different types of words - determine if we want to do this in the key of the dict or in the value (in the wordData struct)
/*
* Anchor text = #
* Title = *
* Url = @
* Body = %
*/
void Parser::parse ( string html, Tokenizer *tokenizer )
{
......@@ -63,17 +66,13 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
++htmlIt;
}
}
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
bool Parser::isScript ( string word )
/*
* Returns true if script tag, false if not
*/
bool Parser::isScript ( string & word )
{
if ( *findStr ( "<script", word ) != '\0' )
{
......@@ -81,8 +80,11 @@ bool Parser::isScript ( string word )
}
return false;
}
string Parser::extract_body( string word )
/*
 * Returns the body text if <p> tags are present, or an empty string if not.
 * If there is no closing tag, stops at the first opening tag or at end of file.
 */
string Parser::extract_body( string & word, int & offset )
{
string body = "";
auto foundBody = findStr("<p", word) != '\0';
......@@ -91,11 +93,20 @@ string Parser::extract_body( string word )
while ( *findStr != '<' )
{
body += *findStr;
if ( *findStr == ' ')
{
count += 1;
}
}
}
return body;
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
string Parser::extract_url ( string & word )
{
......
......@@ -32,6 +32,7 @@ public:
* Parser
* @return
*/
// TODO need to change vector type to word data, change where struct is declared
const unordered_map< string, vector< int>> * execute ( Document* document)
{
Tokenizer tokenizer;
......@@ -48,8 +49,6 @@ private:
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void parse ( string html, Tokenizer *tokenizer );
......@@ -68,6 +67,8 @@ private:
*/
string extract_title ( string & word );
bool isScript ( string & word );
string extract_body( string & word );
};
......@@ -27,21 +27,29 @@ public:
return docIndex;
}
//add type of word parameter, ie paragraph, url etc
void execute ( string originalText, int offset )
{
void execute ( string & originalText, int offset )
{
vector< string > splitText = splitStr ( originalText, ' ' );
string lowerString = "";
string processedString = "";
int vectorLength = 0;
for ( int i = 0; i < splitText.size ( ); ++i )
{
lowerString = toLower ( splitText[ i ] );
if ( !isStopWord ( lowerString ) )
for ( int i = 0; i < splitText.size( ); ++i )
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
processedString = stripStr( processedString );
if ( !isStopWord ( lowerString ) )
{
wordData currentWord;
// stem word
processedString = stem.execute( processedString );
wordData currentWord;
currentWord.offset = offset;
vectorLength = ( *docIndex )[ lowerString ].size( );
( *docIndex )[ lowerString ].push_back ( currentWord );
//incrementing frequency value of the current word
( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
++offset;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment