Skip to content
Snippets Groups Projects
Commit 22d29b60 authored by aanvi's avatar aanvi
Browse files

Added body parsing

parent c1880624
No related branches found
No related tags found
1 merge request: !2 (WIP) "Crawler parser 2" — merge into duplicate-url-crawler
......@@ -6,9 +6,9 @@
* @param urlFrontierIn
*/
Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
{
	// Store the shared url frontier; the Parser does not own the queue.
	// (The diff had left this body duplicated — collapsed to one copy.)
	urlFrontier = urlFrontierIn;
}
/**
......@@ -16,215 +16,160 @@ Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
* @return
*/
const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document )
{
	// Tokenizer allocates the dictionary on the heap in its ctor, so the
	// pointer returned by get( ) outlives this local tokenizer object.
	Tokenizer tokenizer;
	parse( document->DocToString( ), document->getUrl( ), &tokenizer );
	return tokenizer.get( );
}
/**
* Parses file
* @param inFile
* @return
*/
<<<<<<< HEAD
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
// TODO different counts: frequency, total num unique words, etc
//TODO flag different types of words - determine if we want to do this in key of dict or value (in wordData struct)
/*
* Anchor text = #
* Title = *
* Url = @
* Body = %
*/
void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
{
void Parser::parse ( string html, Tokenizer *tokenizer )
{
unsigned long htmlIt = 0;
unsigned long offsetTitle = 0;
unsigned long offsetBody = 0;
unsigned long offsetURL = 0;
//maybe add some code to read in stream and add chars to string as they come in
auto htmlIt = html.begin();
int offset = 0;
while (htmlIt != html.end())
=======
void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
{
// tokenize url
string host = "";
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string urlCurrent = host + "/" + path;
unsigned long htmlIt = 0;
unsigned long offsetTitle = 0;
unsigned long offsetURL = 0;
offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
// tokenize url
string host = "";
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string urlCurrent = host + "/" + path;
while ( htmlIt < html.size( ) )
{
unsigned long begCloseTag = 0;
bool isParagraph = false;
unsigned long savePosition = htmlIt;
// if open bracket
if ( html[ htmlIt ] == '<' )
{
offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
if ( html[ htmlIt + 1 ] == 'p' && ( ( html[htmlIt + 2]) == '>' || ( html[ htmlIt + 2 ] == ' ') ) )
{
begCloseTag = findNext( "</p>", htmlIt, html );
isParagraph = true;
}
else
{
begCloseTag = findNext( "</", htmlIt, html );
}
while ( htmlIt < html.size( ) )
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
{
// if open bracket
if ( html[ htmlIt ] == '<' )
{
<<<<<<< HEAD
// TODO have to put a conditional that ensures the opening and closing tags are the same type
auto begCloseTag = findNext ("</", htmlIt);
auto endCloseTag = findNext ( ">", begCloseTag);
string line (htmlIt, endCloseTag + 1);
=======
unsigned long begCloseTag = findNext( "</", htmlIt, html );
unsigned long endCloseTag = findNext( ">", begCloseTag, html );
string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
htmlIt = endCloseTag + 2;
unsigned long endCloseTag = findNext( ">", begCloseTag, html );
string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
htmlIt = endCloseTag + 2;
//check if line is a script
if ( isScript( line ) )
{
// DO NOTHING
}
// check if line is url
<<<<<<< HEAD
else if ( url = extract_url( line ) != "" )
{
//where is urlFrontier defined?
urlFrontier->push ( url );
}
// check if line is title
else if ( title = extract_title( line ) != "" )
{
tokenizer->execute ( title, offset );
=======
string url = extract_url( line );
if ( url != "" )
{
if ( isLocal( url ) )
{
string completeUrl = "";
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
if ( isValid( url ) && url != urlCurrent )
{
// TODO ParsedUrl with anchor text
ParsedUrl pUrl = ParsedUrl( url );
urlFrontier->Push( pUrl );
cout << url << endl;
}
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
}
else if ( body = extract_body( line ) != "")
// check if line is url
string title = extract_title( line );
string url = extract_url( line );
string header = extract_header( line );
//checking if html line is script
if ( isTag( line, "script" ) )
{
//DO NOTHING
}
//checking for p tag
else if ( isParagraph )
{
string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
}
//if html line is url, parses accordingly and pushes to frontier
else if ( url != "" )
{
if ( isLocal( url ) )
{
tokenizer->execute( body, offset );
string completeUrl = "";
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
else
{
<<<<<<< HEAD
//DO NOTHING
=======
string title = extract_title( line );
if ( title != "" )
{
offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
}
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
}
}
else
{
++htmlIt;
}
}
}
if ( isValid( url ) && url != urlCurrent )
{
// TODO ParsedUrl with anchor text
ParsedUrl pUrl = ParsedUrl( url );
// urlFrontier->Push( pUrl );
cout << url << endl;
}
}
//check if line is header; classifies as body text
else if ( header != "")
{
offsetBody = tokenizer->execute( header, offsetBody, Tokenizer::BODY );
}
// check if line is title
else if ( title != "")
{
offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
}
/*
* Returns true if script tag, false if not
*/
bool Parser::isScript ( string & word )
{
if ( *findStr ( "<script", word ) != '\0' )
{
return true;
}
return false;
}
/*
* Returns body text if p tags, empty string if not
* If there's no closing tag, stops at the first opening tag or when it hits end of file
*/
string Parser::extract_body( string & word, int & offset )
{
string body = "";
auto foundBody = findStr("<p", word) != '\0';
if ( *foundBody != '\0' )
{
while ( *findStr != '<' )
else
{
body += *findStr;
if ( *findStr == ' ')
{
count += 1;
}
//DO NOTHING
}
}
return body;
else
{
++htmlIt;
}
}
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
<<<<<<< HEAD
string Parser::extract_url ( string & word )
=======
string Parser::extract_url ( string html )
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
{
string url = "";
if ( findStr( "<a", html ) != html.size( ) )
{
unsigned long foundHref = findStr( "href", html );
unsigned long foundHttp = findNext( "http", foundHref, html );
if ( foundHttp < html.size( ) )
{
url = "";
unsigned long closeTag = findNext( ">", foundHref, html );
unsigned long closeSpace = findNext( " ", foundHref, html );
unsigned long closeUrl = 0;
// end == ' >'
if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
{
if ( html[ closeSpace - 1 ] == '\"' )
{
closeSpace -= 1;
}
closeUrl = closeSpace;
}
// end == '>'
else if ( closeTag < html.size( ) )
{
if ( html[ closeTag - 1 ] == '\"' )
{
closeTag -= 1;
}
closeUrl = closeTag;
}
{
string url = "";
if ( findStr( "<a", html ) != html.size( ) )
{
unsigned long foundHref = findStr( "href", html );
unsigned long foundHttp = findNext( "http", foundHref, html );
if ( foundHttp < html.size( ) )
{
url = "";
unsigned long closeTag = findNext( ">", foundHref, html );
unsigned long closeSpace = findNext( " ", foundHref, html );
unsigned long closeUrl = 0;
// end == ' >'
if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
{
if ( html[ closeSpace - 1 ] == '\"' )
{
closeSpace -= 1;
}
closeUrl = closeSpace;
}
// end == '>'
else if ( closeTag < html.size( ) )
{
if ( html[ closeTag - 1 ] == '\"' )
{
closeTag -= 1;
}
closeUrl = closeTag;
}
while ( foundHttp != closeUrl && html[ foundHttp ] != '\n')
{
url.push_back( html[ foundHttp ] );
++foundHttp;
}
}
}
while ( foundHttp != closeUrl && html[ foundHttp ] != '\n')
{
url.push_back( html[ foundHttp ] );
++foundHttp;
}
}
}
return url;
}
return url;
}
/**
* Returns a title, or "" if none
......@@ -232,21 +177,21 @@ string Parser::extract_url ( string html )
* @return
*/
string Parser::extract_title ( string html )
{
	// Removed the duplicated body copy and added a bounds guard so a
	// missing closing '<' cannot scan past the end of the string.
	string title = "";
	char end = '<';
	// findStr returns html.size( ) when "<title>" is absent
	auto pos = findStr( "<title>", html );
	if ( pos < html.size( ) )
	{
		pos += 7;       // skip past "<title>"
		while ( pos < html.size( ) && html[ pos ] != end )
		{
			title += html[ pos ];
			++pos;
		}
	}
	return title;
}
/**
* Will return true if local url
......@@ -255,9 +200,9 @@ string Parser::extract_title ( string html )
* @return
*/
bool Parser::isLocal ( string url )
{
	// Removed the duplicated body copy. A relative (local) link starts
	// with '/'; an empty url is never local.
	return !url.empty( ) && url[ 0 ] == '/';
}
/**
* Returns false if the link is an invalid type
......@@ -266,28 +211,163 @@ bool Parser::isLocal ( string url )
* @return
*/
bool Parser::isValid ( string url )
{
	// Removed the duplicated opening block and the unused `size` local.
	string lastFive = lastN( url, 5 );
	string lastFour = lastN( url, 4 );
	// .html pages are always crawlable
	if ( lastFive == ".html" )
	{
		return true;
	}
	// media / style assets are not crawlable:
	// png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico
	if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif"
		|| lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" || lastFour == ".ico" )
	{
		return false;
	}
	// jpeg
	if ( lastFive == ".jpeg" )
	{
		return false;
	}
	return true;
}
//TODO delete?? may not need
// Removes the first <tag>...</tag> pair from html and rewinds htmlIt.
void Parser::remove_tag ( string & html, unsigned long & htmlIt, unsigned long savePosition, string tag )
{
	unsigned long openTag = findStr( "<" + tag + ">", html );
	unsigned long closeTag = findNext( "</" + tag + ">", openTag, html );
	// Erase the closing tag first so openTag's index stays valid.
	// BUG FIX: lengths were swapped — "</tag>" is tag.length( ) + 3 chars,
	// "<tag>" is tag.length( ) + 2.
	html.erase( closeTag, tag.length( ) + 3 );
	html.erase( openTag, tag.length( ) + 2 );
	htmlIt = savePosition;
}
// Classifies one html line and tokenizes it as script / body / url / title.
// Mirrors the dispatch inside parse( ); called from extract_body for tags
// nested inside a paragraph. Offsets are running counters passed by ref.
void Parser::extract_all ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
		ParsedUrl & currentUrl, string & urlCurrent )
	{
	string title = extract_title( line );
	string url = extract_url( line );
	// checking if html line is script
	if ( isTag( line, "script" ) )
		{
		//DO NOTHING — scripts are not indexed
		}
	//TODO delete this conditional if keeping whats in main right now
	else if ( isParagraph )
		{
		// paragraph body text; recurses for nested tags
		string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
		offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
		}
	// if html line is url, resolve it and (eventually) push to the frontier
	else if ( url != "" )
		{
		if ( isLocal( url ) )
			{
			string completeUrl = "";
			completeUrl.assign( currentUrl.CompleteUrl );
			url = completeUrl + url;
			}
		if ( isValid( url ) && url != urlCurrent )
			{
			// TODO ParsedUrl with anchor text
			ParsedUrl pUrl = ParsedUrl( url );
			// urlFrontier->Push( pUrl );
			cout << url << endl;
			}
		}
	// check if line is title
	else if ( title != "")
		{
		offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
		}
	else
		{
		//DO NOTHING
		}
	}
/**
* Returns true if tag is in html, false if not
* @param html
* @return
*/
bool Parser::isTag ( string html, string tag )
{
	// findStr returns html.size( ) when the pattern is absent, so any
	// smaller index means the opening of the tag is present.
	string openTag = "<" + tag;
	return findStr( openTag, html ) != html.size( );
}
// Removed stray diff fragments of isValid( ) that were interleaved into and
// after this function, plus the unused debug locals `a` and `b`.
// Returns the plain text between <p> and </p>; any html nested inside the
// paragraph is handed to extract_all so it is indexed too.
string Parser::extract_body ( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
		ParsedUrl & currentUrl, string & urlCurrent )
{
	string body = "";
	unsigned long startParTag = findNext( "<p>", 0, html );
	unsigned long closeParTag = findNext( "</p>", startParTag, html );
	unsigned long nextCloseTag = findNext( "</", startParTag, html );
	startParTag += 3;       // skip past "<p>"
	while ( nextCloseTag != startParTag )
	{
		if ( closeParTag == nextCloseTag )
		{
			// plain text runs straight to </p>
			while ( startParTag != closeParTag )
			{
				body += html[ startParTag ];
				++startParTag;
				if ( startParTag >= html.size( ) )
				{
					return body;
				}
			}
		}
		else
		{
			// a nested tag opens before </p>: copy the text before it,
			// index the nested html recursively, then resume after it
			unsigned long newHtmlStart = findNext( "<", startParTag, html );
			unsigned long closeNewHtml = findNext( ">", newHtmlStart, html );
			unsigned long newHtmlTagLength = closeNewHtml - newHtmlStart;
			while ( startParTag != newHtmlStart )
			{
				body += html[ startParTag ];
				++startParTag;
			}
			string newHtml = subStr( html, newHtmlStart, nextCloseTag - newHtmlStart + newHtmlTagLength + 2 );
			extract_all( newHtml, offsetTitle, offsetBody, false, tokenizer, currentUrl, urlCurrent );
			startParTag = nextCloseTag + newHtmlTagLength + 2;
			nextCloseTag = findNext( "</", startParTag, html );
		}
	}
	return body;
}
string Parser::extract_header ( string html )
{
	// Header tags are <h1> through <h6>; findStr returns html.size( )
	// when "<h" does not occur at all.
	string header = "";
	unsigned long begin = findStr( "<h", html );
	bool isHeaderTag = begin != html.size( ) && ( html[ begin + 1 ] >= '1' && html[ begin + 1 ] <= '6' );
	if ( isHeaderTag )
	{
		unsigned long end = findNext( "</h", begin, html );
		// skip past "<hN>" and copy up to the closing tag
		for ( begin += 4; begin != end; ++begin )
		{
			header += html[ begin ];
		}
	}
	return header;
}
......@@ -19,92 +19,79 @@ using namespace std;
* Returns a pointer to a dictionary that contains the tokenized input
*/
class Parser
{
{
public:
<<<<<<< HEAD
Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
{
urlFrontier = urlFrontierIn;
}
=======
/**
* Parser Cstor
* @param urlFrontierIn
*/
Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
/**
* Executes the Parser
* @return
*/
<<<<<<< HEAD
// TODO need to change vector type to word data, change where struct is declared
const unordered_map< string, vector< Tokenizer::wordData>> * execute ( Document* document)
{
Tokenizer tokenizer;
parse ( document->DocToString (), &tokenizer );
return tokenizer.get ( );
}
=======
const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
/**
* Parser Cstor
* @param urlFrontierIn
*/
Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn );
/**
* Executes the Parser
* @return
*/
const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
private:
ProducerConsumerQueue< ParsedUrl > *urlFrontier;
/**
* Parses file
* @param inFile
* @return
*/
<<<<<<< HEAD
void parse ( string html, Tokenizer *tokenizer );
=======
void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
/**
* Returns a url, or "" if none
* @param html
* @return
*/
string extract_url ( string html );
/**
* Returns a title, or "" if none
* @param html
* @return
*/
string extract_title ( string html );
<<<<<<< HEAD
bool isScript ( string & word );
string extract_body( string & word );
=======
/**
* Will return true if local url
*
* @param url
* @return
*/
bool isLocal ( string url );
/**
* Returns true is url is valid
*
* @param url
* @return
*/
bool isValid ( string url );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
};
ProducerConsumerQueue< ParsedUrl > *urlFrontier;
/**
* Parses file
* @param inFile
* @return
*/
void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
/**
* Returns a url, or "" if none
* @param html
* @return
*/
string extract_url ( string html );
/**
* Returns a title, or "" if none
* @param html
* @return
*/
string extract_title ( string html );
/**
* Will return true if local url
*
* @param url
* @return
*/
bool isLocal ( string url );
/**
* Returns true is url is valid
*
* @param url
* @return
*/
bool isValid ( string url );
bool isTag( string html, string tag );
string extract_body( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
ParsedUrl & currentUrl, string & urlCurrent );
void extract_all ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
ParsedUrl & currentUrl, string & urlCurrent );
//TODO delete?? may not need
void remove_tag( string & html, unsigned long & htmlIt, unsigned long savePosition, string tag);
string extract_header( string html );
};
#include <string>
#include <cstring>
#include <cassert>
#include <iostream>
#include "../Parser.h"
......@@ -14,6 +15,10 @@ void testComplex ( );
void testURL ( );
void testExtractBody( );
void testBody( );
int main ( )
{
cout << "Testing Parser ... " << endl << endl;
......@@ -26,11 +31,13 @@ int main ( )
cout << "Testing Complex: " << endl;
testComplex( );
cout << "Complex Test Passed!" << endl;
cout << "Parser Tests Passed! :D" << endl;
}
cout << "Testing BODY: " << endl;
testExtractBody( );
testBody( );
cout << "Parser Tests Passed! :D" << endl;
}
void testSimple ( )
void testSimple( )
{
ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
......@@ -143,4 +150,93 @@ void testURL ( )
delete dictionary;
dictionary = nullptr;
}
\ No newline at end of file
}
// Smoke test: parses a small full html page (head/style/h1/p) and dumps the
// resulting word -> offsets dictionary. Currently print-only; the size
// assertion is disabled pending a confirmed expected count.
void testBody( )
	{
	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
	ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
	char docString[1024];
	strcpy( docString, "<!DOCTYPE html>\n"
			"<html>\n"
			"<head>\n"
			"<!-- HTML Codes by Quackit.com -->\n"
			"<title>\n"
			"Story of Cat</title>\n"
			"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n"
			"<meta name=\"keywords\" content=\"cat story\">\n"
			"<meta name=\"description\" content=\"This is the tale of a cat names joe\">\n"
			"<style>\n"
			"body {background-color:#ffffff;background-repeat:no-repeat;background-position:top left;background-attachment:fixed;}\n"
			"h1{font-family:Arial, sans-serif;color:#000000;background-color:#ffffff;}\n"
			"p {font-family:Georgia, serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;}\n"
			"</style>\n"
			"</head>\n"
			"<body>\n"
			"<h1>Joe the cat</h1>\n"
			"<p>On Saturday, joe the cat went to the store. He climbed up a mountain? It was weird. The store was called Food Store</p>\n"
			"</body>\n"
			"</html>" );
	Document document( url, docString );
	Parser parser( &urlFrontierTest );
	auto dictionary = parser.execute( &document );
	cout << dictionary->size( ) << endl;
	//assert( dictionary->size( ) == 4);
	// Dump every token and its offsets for manual inspection.
	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
		{
		cout << it->first << ':';
		for ( int i = 0; i < it->second.size( ); ++i )
			{
			cout << it->second[ i ] << " ";
			}
		cout << std::endl;
		}
	}
void testExtractBody ( )
{
ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
char docString[1024];
strcpy( docString, "<title>Paragraph body text hello</title>" );
Document document( url, docString );
Parser parser( &urlFrontierTest );
auto dictionary = parser.execute( &document );
cout << dictionary->size( ) << endl;
for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
cout << endl << endl;
assert( dictionary->size( ) == 6);
char docString2[1024];
strcpy( docString2, "<p>Paragraph body text hello <title>Specific title</title> more body words</p>" );
Document document2( url, docString2 );
Parser parser2 ( &urlFrontierTest );
dictionary = parser.execute( &document2 );
cout << "Dictionary 2 size " << dictionary->size( ) << endl;
for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
cout << std::endl;
}
assert( dictionary->size( ) == 10);
assert( dictionary->at( "#specif" )[0] == 0);
assert( dictionary->at("%paragraph")[0] == 0);
assert( dictionary->at("%bodi")[1] == 5);
}
#pragma once
#include <string>
......@@ -6,111 +5,57 @@
#include <vector>
#include "stringProcessing.h"
#include "Stemmer.h"
#include "../parser/Parser.h"
using namespace std;
class Tokenizer
{
{
public:
struct wordData {
int frequency = 0;
int offset;
};
Tokenizer ( )
{
docIndex = new unordered_map< string, vector<wordData>>;
}
unordered_map< string, vector< wordData>> *get ( ) const
{
return docIndex;
}
//add type of word parameter, ie paragraph, url etc
void execute ( string & originalText, int offset )
{
vector< string > splitText = splitStr ( originalText, ' ' );
string processedString = "";
int vectorLength = 0;
for ( int i = 0; i < splitText.size( ); ++i )
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
processedString = stripStr( processedString );
if ( !isStopWord ( lowerString ) )
{
// stem word
processedString = stem.execute( processedString );
wordData currentWord;
currentWord.offset = offset;
vectorLength = ( *docIndex )[ lowerString ].size( );
( *docIndex )[ lowerString ].push_back ( currentWord );
//incrementing frequency value of the current word
( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
++offset;
}
}
}
// decorators
static const char TITLE = '#';
static const char ANCHOR = '@';
static const char URL = '$';
/**
* Tokenizer Cstor
*/
Tokenizer ( );
/**
* Returns pointer to the docIndex dictionary
*
* @return pointer to unordered_map< string, vector< int>>
*/
<<<<<<< HEAD
unordered_map< string, vector<wordData>> *get ( ) const;
=======
unordered_map< string, vector< unsigned long > > *get ( ) const;
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
/**
* Executes the Tokenizer
* Sends tokens to dictionary
*
*
* @param originalText
* @param offset
* @param decorator
*/
unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
// decorators
static const char TITLE = '#';
static const char ANCHOR = '@';
static const char URL = '$';
static const char BODY = '%';
/**
* Tokenizer Cstor
*/
Tokenizer ( );
/**
* Returns pointer to the docIndex dictionary
*
* @return pointer to unordered_map< string, vector< int>>
*/
unordered_map< string, vector< unsigned long > > *get ( ) const;
/**
* Executes the Tokenizer
* Sends tokens to dictionary
*
*
* @param originalText
* @param offset
* @param decorator
*/
unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
private:
unordered_map< string, vector< unsigned long > > *docIndex;
Stemmer stem;
unordered_map< string, vector< unsigned long > > *docIndex;
Stemmer stem;
/**
* Tokenizes text (titles, body text)
*
* @param originalText
* @param offset
* @param decorator
*/
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
/**
* Tokenizes text (titles, body text)
*
* @param originalText
* @param offset
* @param decorator
*/
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
<<<<<<< HEAD
private:
unordered_map< string, vector<wordData>> *docIndex;
Stemmer stem;
=======
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
};
};
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment