Skip to content
Snippets Groups Projects
Commit 2f0739e4 authored by vcday
Browse files

changed parser function

parent 910c3ec5
No related branches found
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@
#include "LocalReader.h"
#include "SocketReader.h"
#include "../shared/documentMap.h"
#include "../shared/Document.h"
string Spider::getUrl()
......@@ -12,153 +12,126 @@
#include <queue>
#include <iostream>
#include <fstream>
#include "Tokenizer.h"
#include "../util/Tokenizer.h"
#include "../util/stringProcessing.h"
#include "../shared/Document.h"
#include "../shared/ProducerConsumerQueue.h"
using namespace std;
// Doc Id
std::priority_queue< int > DOCID_PQ;
std::priority_queue< string > URL_PQ;
string PATH = "/doc";
//TEMP - remove once getting actual crawler input
// get doc id from DocIDqueue (sent from crawler)
// go to disk and get the HTML file
// parse the html file
// if find url; send to crawler
// if find title send string to tokenizer
* This class uses the Doc object from the Crawler to parse the text
* Returns a pointer to a dictionary that contains the tokenized input
class Parser
struct raw_data
Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
string url;
string html_data;
raw_data ( string u, string h ) : url ( u ), html_data ( h )
{ }
urlFrontier = urlFrontierIn;
* Parser
* @return
// input: object with char* and URL string
const unordered_map< string, vector< int>> execute ( )
const unordered_map< string, vector< int>> * execute ( Document* document)
Tokenizer tokenizer;
//TEMP - until we get real input from crawler
raw_data data ( "url", "html" );
parse ( data.html_data, &tokenizer );
parse ( document->DocToString (), &tokenizer );
return tokenizer.get ( );
ProducerConsumerQueue < string >* urlFrontier;
* Parses file
* @param inFile
* @return
string parse ( string & html_data, Tokenizer *tokenizer )
void parse ( string html, Tokenizer *tokenizer )
//figure out file handle syntax - pointer to file
string tokenizerInput = "";
string currentTerm = "";
for ( int i = 0; i < html_data.size ( ); ++i )
for ( int i = 0; i < html.size ( ); ++i )
while ( html_data[ i ] != ' ' )
while ( i ) != '\n' )
currentTerm += html_data[ i ];
currentTerm += html[ i ];
//one method that directly adds urls onto frontier instead of checking for them
add_urls ( currentTerm );
check_title ( currentTerm );
tokenizerInput += currentTerm;
string url = extract_url ( currentTerm );
if (url != "")
urlFrontier->Push (url);
string title = extract_title ( currentTerm );
if (title != "")
tokenizerInput += title;
tokenizer->execute ( tokenizerInput );
* Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
* Instead of bool, just directly adds on to url queue
* Returns a url, or "" if none
* @param word
* @return
void add_urls ( string & word )
string extract_url ( string word )
string a_tag = "<a";
string http_start = "href=http";
string http_end_tag = ">";
auto word_iter = word.begin ( );
string url = "";
word_iter = findStr ( word_iter, a_tag );
if ( word_iter != nullptr )
if ( findStr ( word, "<a" ) != '\0' )
auto found_http = findStr ( word_iter, http_start );
if ( found_http != nullptr )
auto foundHttp = findStr ( word, "href=http" );
if ( foundHttp != '\0' )
url = "http";
found_http += 9;
auto end_http = findStr ( word_iter, http_end_tag );
while ( found_http != end_http )
foundHttp += 9;
while ( foundHttp != findStr ( word, "\">" ) )
url += *found_http;
url += *foundHttp;
if ( url != "" )
URL_PQ.push ( url );
return url;
* <title >AJF</title>
* Returns a title, or "" if none
* @param word
* @return
bool check_title ( string & word )
string extract_title ( string & word )
if ( char *pos = strstr ( "<title>", word ) )
string title = "";
auto pos = findStr ( "<title>", word );
if ( pos != '\0')
pos += 6;
auto end_pos = strstr ( "</title>", word );
string title = "";
while ( pos != end_pos )
while ( pos != findStr ( "</title>", word ) )
title += *pos;
return title;
// string begin_title = "<title>";
// auto word_begin = word.begin();
// auto word_iter = findStr(word_begin, begin_title);
return title;
// Created by anvia on 2/6/2018.
#include <string>
#include "../../util/stringProcessing.h"
#include <iostream>
using namespace std;
int main()
string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout."
"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
"making it look like readable English. ";
string subStr = "readable";
auto iter = findStr(subStr, original);
cout << *iter << endl;
// Created by Veronica Day on 2/13/18.
// Created by anvia on 2/6/2018.
#include <string>
#include <cassert>
#include <iostream>
#include "../Parser.h"
#include "../../shared/Document.h"
#include "../../shared/ProducerConsumerQueue.h"
using namespace std;
int main ( )
cout << "Testing Parser ... " << endl << endl;
ProducerConsumerQueue < string > * urlFrontierTest;
Document document ( "<!DOCTYPE html>\n"
"<!-- HTML Codes by -->\n"
"Story of Cat</title>\n"
"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n"
"<meta name=\"keywords\" content=\"cat story\">\n"
"<meta name=\"description\" content=\"This is the tale of a cat names joe\">\n"
"body {background-color:#ffffff;background-repeat:no-repeat;background-position:top left;background-attachment:fixed;}\n"
"h1{font-family:Arial, sans-serif;color:#000000;background-color:#ffffff;}\n"
"p {font-family:Georgia, serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;}\n"
"<h1>Joe the cat</h1>\n"
"<p>On Saturday, joe the cat went to the store. He climbed up a mountain? It was weird. The store was called Food Store</p>\n"
"</html>" );
Parser parser ( urlFrontierTest );
auto dictionary = parser.execute ( &document );
assert( dictionary != nullptr );
cout << "Parser Tests Passed! :D" << endl;
// Created by Jake Close on 2/8/18.
#pragma once
#include "url.h"
#include <string>
#include <vector>
#include <pthread.h>
using namespace std;
namespace filepath
const char *DOC_MAP = "/docMap.txt";
pthread_mutex_t docMap_mutex = PTHREAD_MUTEX_INITIALIZER;
class Document
ParsedUrl url;
long docID;
bool lastCrawlStatus;
int lastCrawlDate;
int lastCrawlPageCount;
//add more info fields here
Document ( string url_in ) : url ( ParsedUrl ( url_in ) )
{ }
string DocToString ( )
return string ( url.CompleteUrl, strlen ( url.CompleteUrl ) ) + "\n";
int WriteToDocMap ( )
pthread_mutex_lock ( &docMap_mutex );
//for now just write url
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "W" );
off_t resultPosition = 0;
//check if it's available
if ( file == -1 )
throw ( "error opening docMap" );
//get the current size of the docMap
size_t seekPosition = util::FileSize ( file );
//seek to the end of the file
resultPosition = lseek ( file, seekPosition, SEEK_SET );
if ( resultPosition == -1 )
throw ( "Could not seek" );
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
size_t success = write ( file, this->DocToString ( ).c_str ( ),
strlen ( this->DocToString ( ).c_str ( ) ) );
if ( success == -1 )
throw ( "Error writing document object to document map" );
catch ( const char *str )
cerr << str << endl;
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return -1;
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return resultPosition;
static void PrintDocMap ( string url, int location )
pthread_mutex_lock ( &docMap_mutex );
std::cout << url << " is " << location;
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "R" );
//check if it's available
if ( file )
off_t resultPosition = lseek ( file, ( size_t ) location, SEEK_SET );
int bytes = 14;
if ( bytes > 0 )
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( bytesRead = read ( file, buffer, bytes ) )
write ( 1, buffer, bytesRead );
cerr << "Could not read " << bytes << " bytes at position " <<
resultPosition << ", error = " << errno;
pthread_mutex_unlock ( &docMap_mutex );
pthread_mutex_unlock ( &docMap_mutex );
\ No newline at end of file
// Created by Jake Close on 2/8/18.
#pragma once
#include "url.h"
#include <string>
#include <vector>
#include <pthread.h>
using namespace std;
namespace filepath
const char* DOC_MAP = "/docMap.txt";
pthread_mutex_t docMap_mutex = PTHREAD_MUTEX_INITIALIZER;
class Document
ParsedUrl url;
long docID;
bool lastCrawlStatus;
int lastCrawlDate;
int lastCrawlPageCount;
//add more info fields here
Document(string url_in) : url(ParsedUrl(url_in)) {}
string DocToString()
return string(url.CompleteUrl, strlen(url.CompleteUrl)) + "\n";
int WriteToDocMap()
//for now just write url
string loc = util::GetCurrentWorkingDir() + filepath::DOC_MAP;
int file = util::getFileDescriptor(loc.c_str(), "W");
off_t resultPosition = 0;
try {
//check if it's available
if (file == -1) {
throw("error opening docMap");
} else {
//get the current size of the docMap
size_t seekPosition = util::FileSize(file);
//seek to the end of the file
resultPosition = lseek(file, seekPosition, SEEK_SET);
if (resultPosition == -1) {
throw("Could not seek");
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
size_t success = write(file, this->DocToString().c_str(), strlen(this->DocToString().c_str()));
if (success == -1) {
throw("Error writing document object to document map");
catch(const char* str){
cerr << str << endl;
return -1;
close( file );
return resultPosition;
static void PrintDocMap(string url, int location)
std::cout << url << " is " << location;
string loc = util::GetCurrentWorkingDir() + filepath::DOC_MAP;
int file = util::getFileDescriptor( loc.c_str(), "R" );
//check if it's available
if ( file )
off_t resultPosition = lseek( file, (size_t)location, SEEK_SET );
int bytes = 14;
if ( bytes > 0 )
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( bytesRead = read( file, buffer, bytes ))
write( 1, buffer, bytesRead );
cerr << "Could not read " << bytes << " bytes at position " <<
resultPosition << ", error = " << errno;
\ No newline at end of file
// Created by Veronica Day on 2/22/18.
class Stemmer
......@@ -5,7 +5,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "../util/stringProcessing.h"
#include "stringProcessing.h"
using namespace std;
......@@ -13,46 +13,56 @@
using namespace std;
* Takes in an iterator to the original text and a substring: specifically for a parser functionality
* Potentially make one that takes in two strings? Is this needed?
* Set of stopwords
set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
"for", "have", "he", "her", "here", "him", "his", "how",
"i", "in", "is", "it", "its", "many ", "me", "my", "none", "of", "on", "or", "our", "she",
"some", "the", "their", "them", "there", "they", "that",
"this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
"you", "your" };
* Finds the needle in the haystack
* @param haystack
* @param needle
* @return
string::iterator findStr ( string::iterator originalText, string & subStr )
string::iterator findStr ( string haystack, string needle )
auto begin_sub = subStr.begin ( );
auto begin_original = originalText;
auto beginNeedle = needle.begin ( );
auto beginHaystack = haystack.begin();
while ( *begin_original != '\0' ) //*(forward++) != '\0'
while ( *beginHaystack != '\0' )
//keep looking for instance of a match
if ( *begin_original != *begin_sub )
if ( *beginHaystack != *beginNeedle )
else if ( *begin_original == *begin_sub )
else if ( *beginHaystack == *beginNeedle )
/* want to keep the original iterator where it is so it
can return the beginning of the matched word if found */
auto temp = begin_original;
while ( *temp == *begin_sub )
auto temp = beginHaystack;
while ( *temp == *beginNeedle )
//if it hits the end of the substring, it signifies an exact match
if ( *begin_sub == '\0' )
//if it hits the end of the needle, it signifies an exact match
if ( *beginNeedle == '\0' )
//this is pointing at the beginning of the match
return begin_original;
return beginHaystack;
//need to reset because still has to search rest of the string for a match
begin_sub = subStr.begin ( );
beginNeedle = needle.begin ( );
//sets the original text pointer to where the last search left off
begin_original = temp;
beginHaystack = temp;
......@@ -61,18 +71,19 @@ string::iterator findStr ( string::iterator originalText, string & subStr )
return begin_original;
return beginHaystack;
set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
"for", "have", "he", "her", "here", "him", "his", "how",
"i", "in", "is", "it", "its", "many ", "me", "my", "none", "of", "on", "or", "our", "she",
"some", "the", "their", "them", "there", "they", "that",
"this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
"you", "your" };
vector< string > splitStr ( string & originalText, char delim )
* Returns a vector of strings from @originalText, split by @delim
* @param originalText
* @param delim
* @return
vector< string > splitStr ( string originalText, char delim )
vector< string > splitWords;
auto begin = originalText.begin ( );
......@@ -93,14 +104,22 @@ vector< string > splitStr ( string & originalText, char delim )
return splitWords;
bool isStopWord ( string & word )
* Returns true if @word is a stopword
* @param word
* @return
bool isStopWord ( string word )
return ( stopWords.find ( word ) != stopWords.end ( ) );
string toLower ( string & word )
* Returns lowercase @word
* @param word
* @return
string toLower ( string word )
auto iter = word.begin ( );
string lowerWord = "";
......@@ -121,4 +140,15 @@ string toLower ( string & word )
return lowerWord;
* Returns stemmed @word
* @param word
* @return
string stemWord(string word)
return "";
// Created by Veronica Day on 2/22/18.
......@@ -4,19 +4,19 @@
#include <string>
#include <vector>
#include "../../util/stringProcessing.h"
#include "../stringProcessing.h"
#include <iostream>
#include <cassert>
using namespace std;
void test_findStr ( string original );
void testFindStr ( string original );
void test_splitStr ( string original );
void testSplitStr ( string original );
void test_toLower ( );
void testToLower ( );
void test_isStopWord ( );
void testIsStopWord ( );
int main ( )
......@@ -27,66 +27,46 @@ int main ( )
"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
"making it look like readable English. ";
test_findStr ( original );
test_splitStr ( original );
test_toLower ( );
test_isStopWord ( );
testFindStr ( original );
testSplitStr ( original );
testToLower ( );
testIsStopWord ( );
cout << "\nTests passed for StringProcessing_unit :D" << endl;
void test_findStr ( string original )
void testFindStr ( string original )
cout << "Testing findStr..." << endl;
string find = "established";
auto word = findStr ( original.begin ( ), find );
assert( *word == 'e' );
find = "Lorem Ipsum";
auto word2 = findStr ( original.begin ( ), find );
assert( *word2 == 'L' );
assert( *findStr ( original, "established" ) == 'e' );
assert( *findStr ( original, "Lorem Ipsum" ) == 'L' );
string title = "<title> This is a test </title>";
find = "<title>";
auto word3 = findStr ( title.begin ( ), find );
assert( *word3 == '<' );
auto word = findStr ( title, "<title>" );
assert( *word == '<' );
auto titleIt = title.begin ( );
while ( word3 != title.end ( ) && titleIt != title.end ( ) )
while ( word != title.end ( ) && titleIt != title.end ( ) )
assert( *word3 == *titleIt );
assert( *word == *titleIt );
find = "</title>";
auto word4 = findStr ( title.begin ( ), find );
assert( *word4 == '<' && *( word4 + 1 ) == '/' );
auto word0 = findStr ( original.begin ( ), find );
assert( *word0 == '\0' );
find = "orange";
auto word5 = findStr ( original.begin ( ), find );
assert( *word5 == '\0' );
find = "orange";
string test = "apple";
auto word7 = findStr ( test.begin ( ), find );
assert( *word7 == '\0' );
find = "bird";
test = "bigbird";
auto word6 = findStr ( test.begin ( ), find );
assert( *word6 == 'b' && *( word6 + 1 ) == 'i' && *( word6 + 2 ) == 'r' );
auto word1 = findStr ( title, "</title>" );
assert( *word1 == '<' && *( word1 + 1 ) == '/' );
assert( *findStr ( original, "</title>" ) == '\0' );
assert( *findStr ( original, "orange" ) == '\0' );
assert( *findStr ( "apple", "orange" ) == '\0' );
auto word2 = findStr ( "bigbird", "bird" );
assert( *word2 == 'b' && *( word2 + 1 ) == 'i' && *( word2 + 2 ) == 'r' );
cout << "test_findStr passed" << endl;
cout << "testFindStr passed" << endl;
void test_splitStr ( string original )
void testSplitStr ( string original )
cout << "Testing splitStr..." << endl;
......@@ -98,12 +78,12 @@ void test_splitStr ( string original )
assert( vec.size ( ) == 2 );
assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
cout << "test_splitStr passed" << endl;
cout << "testSplitStr passed" << endl;
void test_toLower ( )
void testToLower ( )
cout << "Testing toLower..." << endl;
......@@ -126,11 +106,11 @@ void test_toLower ( )
assert ( test4 == "" );
assert ( test5 == " " );
cout << "test_toLower passed" << endl;
cout << "testToLower passed" << endl;
void test_isStopWord ( )
void testIsStopWord ( )
cout << "Testing isStopWord..." << endl;
......@@ -146,6 +126,6 @@ void test_isStopWord ( )
assert ( !isStopWord ( blank ) );
assert ( !isStopWord ( blank2 ) );
cout << "test_isStopWord passed" << endl;
cout << "testIsStopWord passed" << endl;
\ No newline at end of file
......@@ -10,7 +10,7 @@
using namespace std;
void test_execute ( string original );
void testExecute ( string original );
int main ( )
......@@ -22,18 +22,18 @@ int main ( )
"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
"making it look like readable English. ";
test_execute ( original );
testExecute ( original );
cout << "\nTests passed for TokenizerTest_unit :D" << endl;
void test_execute ( string original )
void testExecute ( string original )
Tokenizer my_tokenizer;
my_tokenizer.execute ( original );
Tokenizer myTokenizer;
myTokenizer.execute ( original );
auto dict = my_tokenizer.get ( );
auto dict = myTokenizer.get ( );
for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment