Skip to content
Snippets Groups Projects
Commit 3841e35d authored by vcday's avatar vcday
Browse files

Merge branch 'parser' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into parser

parents 96342c55 0755de6a
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,10 @@
// Created by Veronica Day on 1/28/18.
//
// keep running count of offset, if stop word: don't increment and remove stopword
// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
//
#include <string>
#include <functional>
#include <queue>
......@@ -41,9 +45,10 @@ public:
if (!inFile)
cerr << "Unable to open file datafile.txt";
Tokenizer tokenizer();
parse(inFile, &tokenizer);
parse(inFile);
return tokenizer.get();
......@@ -60,18 +65,25 @@ private:
* @param inFile
* @return
*/
string parse(ifstream inFile)
string parse(ifstream inFile, Tokenizer *tokenizer)
{
//figure out file handle syntax - pointer to file
string word = "";
while (!inFile.eof())
{
inFile >> word;
// checks for url
check_url(word);
// checks for title tags
Tokenizer.execute(check_title(word));
if (!check_url(word))
{
// checks for title tags
tokenizer->execute(check_title(word));
}
}
}
......@@ -80,7 +92,7 @@ private:
* Checks for url in string word
* @param word
*/
void check_url(string &word)
bool check_url(string &word)
{
if (char* pos = strstr("href", word))
{
......@@ -92,12 +104,17 @@ private:
++pos;
while (pos != "\"" && pos != "\'")
{
//filter out everything except http, https
url += *pos;
}
// send it back to the crawler
URL_PQ.push(url);
return true;
}
return false;
}
/**
......
//
// Created by anvia on 1/31/2018.
//
#include <string>
#include <stdio.h>
#include <string.h>
#include <unordered_map>
#include <vector>
#include <stringProcessing.h>
using namespace std;
/**
 * Builds a word index for a document: each lower-cased, non-stop word maps to
 * the list of offsets at which it was seen.
 */
class Tokenizer
{
public:
    Tokenizer() {}

    /**
     * Gives read-only access to the index accumulated by execute().
     * @return reference to the map of lower-cased word -> recorded offsets
     */
    const unordered_map<string, vector<int>> & get() const
    {
        return doc_index;
    }

    /**
     * Splits originalText on single spaces, lower-cases every token and records
     * the offset of each token that is not a stop word.
     * Stop words are dropped and do not advance the offset.
     * @param originalText text to tokenize
     */
    void execute(string originalText)
    {
        // NOTE(review): offset restarts at 0 on every call, so offsets recorded by
        // separate execute() calls overlap - confirm whether a running, per-document
        // count (a member variable) is what the indexer expects.
        int offset = 0;
        vector<string> splitText = splitStr(originalText, ' ');
        for (string &token : splitText)
        {
            string lowerString = toLower(token);
            if (!isStopWord(lowerString))
            {
                doc_index[lowerString].push_back(offset);
                ++offset;
            }
        }
    }

private:
    // lower-cased word -> offsets at which the word was recorded
    unordered_map<string, vector<int>> doc_index;
};
//
// Created by anvia on 1/31/2018.
//
#ifndef EECS398_SEARCH_STRINGPROCESSING_H
#define EECS398_SEARCH_STRINGPROCESSING_H
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
//TODO
//remove tag words
// Words that isStopWord() reports as stop words. All entries are lower-case and
// carry no surrounding whitespace, so they match lower-cased tokens exactly.
set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
"i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
"this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
/**
 * Splits originalText into the pieces separated by delim.
 * Empty pieces (leading, trailing or repeated delimiters) are dropped, so the
 * result never contains an empty string.
 * @param originalText text to split (not modified)
 * @param delim character that separates the pieces
 * @return the non-empty pieces in their original order; empty if there are none
 */
vector<string> splitStr(const string &originalText, char delim)
{
    vector<string> splitWords;
    auto begin = originalText.begin();
    auto end = originalText.end();
    while (begin != end)
    {
        string word = "";
        while (begin != end && *begin != delim)
        {
            word += *begin;
            ++begin;
        }
        if (!word.empty())
        {
            splitWords.push_back(word);
        }
        // step over the delimiter that ended this piece; without this the
        // outer loop would never make progress
        if (begin != end)
        {
            ++begin;
        }
    }
    return splitWords;
}
/**
 * Tells whether word is one of the entries of stopWords.
 * The comparison is exact, so callers should pass an already lower-cased word.
 * @param word word to look up (not modified)
 * @return true if word is contained in stopWords, false otherwise
 */
bool isStopWord(const string &word)
{
    // set::find returns end() (not a null pointer) when the key is absent
    return stopWords.find(word) != stopWords.end();
}
/**
 * Returns a lower-cased copy of word.
 * Only the ASCII letters 'A'-'Z' are converted; every other character is
 * copied unchanged.
 * @param word text to convert (not modified)
 * @return copy of word with ASCII upper-case letters replaced by lower-case ones
 */
string toLower(const string &word)
{
    string lowerWord = "";
    lowerWord.reserve(word.size());
    for (char c : word)
    {
        if (c >= 'A' && c <= 'Z')
        {
            // shift the upper-case letter onto its lower-case counterpart
            c = static_cast<char>(c - 'A' + 'a');
        }
        lowerWord += c;
    }
    return lowerWord;
}
#endif //EECS398_SEARCH_STRINGPROCESSING_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment