Skip to content
Snippets Groups Projects
Commit 0755de6a authored by aanvi's avatar aanvi
Browse files

Added tokenizer

parent 1e1d50c5
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,10 @@
// Created by Veronica Day on 1/28/18.
//
// keep running count of offset; if stop word: don't increment and remove the stop word
// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
//
#include <string>
#include <functional>
#include <queue>
......@@ -41,9 +45,10 @@ public:
if (!inFile)
cerr << "Unable to open file datafile.txt";
Tokenizer tokenizer();
parse(inFile, &tokenizer);
parse(inFile);
return tokenizer.get();
......@@ -60,18 +65,25 @@ private:
* @param inFile
* @return
*/
string parse(ifstream inFile)
string parse(ifstream inFile, Tokenizer *tokenizer)
{
//figure out file handle syntax - pointer to file
string word = "";
while (!inFile.eof())
{
inFile >> word;
// checks for url
check_url(word);
// checks for title tags
Tokenizer.execute(check_title(word));
if (!check_url(word))
{
// checks for title tags
tokenizer->execute(check_title(word));
}
}
}
......@@ -80,7 +92,7 @@ private:
* Checks for url in string word
* @param word
*/
void check_url(string &word)
bool check_url(string &word)
{
if (char* pos = strstr("href", word))
{
......@@ -92,12 +104,17 @@ private:
++pos;
while (pos != "\"" && pos != "\'")
{
//filter out everything except http, https
url += *pos;
}
// send it back to the crawler
URL_PQ.push(url);
return true;
}
return false;
}
/**
......
//
// Created by anvia on 1/31/2018.
//
#include <string>
#include <stdio.h>
#include <string.h>
#include <unordered_map>
#include <vector>
#include <stringProcessing.h>
using namespace std;
/**
 * Builds a word index for a piece of text: each non-stop word (lower-cased)
 * maps to the list of offsets at which it was seen.
 */
class Tokenizer
{
public:
    Tokenizer() {}

    /**
     * @return read-only reference to the word -> offsets index built so far
     */
    const std::unordered_map<std::string, std::vector<int>> &get() const
    {
        // Return the map itself; taking its address would yield a pointer,
        // which does not match the reference return type.
        return doc_index;
    }

    /**
     * Splits originalText on spaces, lower-cases every piece and records the
     * offset of each piece that is not a stop word.
     *
     * Stop words do not consume an offset, so offsets count indexed words only.
     *
     * NOTE(review): offset restarts at 0 on every call. If positions are meant
     * to run across a whole document, it has to become a member - confirm
     * against the parser, which calls this repeatedly.
     *
     * @param originalText text to index
     */
    void execute(std::string originalText)
    {
        int offset = 0;
        std::vector<std::string> splitText = splitStr(originalText, ' ');
        for (std::string &token : splitText)
        {
            // Lower-case the current token, not the whole vector.
            std::string lowerString = toLower(token);
            if (!isStopWord(lowerString))
            {
                doc_index[lowerString].push_back(offset);
                ++offset;
            }
        }
    }

private:
    // word -> offsets at which the word was indexed
    std::unordered_map<std::string, std::vector<int>> doc_index;
};
//
// Created by anvia on 1/31/2018.
//
#ifndef EECS398_SEARCH_STRINGPROCESSING_H
#define EECS398_SEARCH_STRINGPROCESSING_H
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
//TODO
//remove tag words
// Words that are filtered out rather than indexed; looked up by isStopWord().
// Declared const so the table cannot be altered by accident; a namespace-scope
// const object also has internal linkage, so including this header from
// several translation units does not produce multiple-definition errors.
// Every entry must be lower case with no surrounding whitespace, otherwise it
// can never match a token ("many " with a trailing space never matched "many").
const std::set<std::string> stopWords = {
    "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
    "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
    "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
/**
 * Splits originalText into the pieces separated by delim.
 *
 * Empty pieces (from leading, trailing or repeated delimiters) are dropped,
 * so the result contains only non-empty words.
 *
 * @param originalText text to split; not modified
 * @param delim        separator character
 * @return the non-empty pieces, in their original order
 */
inline std::vector<std::string> splitStr(const std::string &originalText, char delim)
{
    std::vector<std::string> splitWords;
    std::string word;
    for (char c : originalText)
    {
        if (c != delim)
        {
            word += c;
            continue;
        }
        // Reached a delimiter: emit the word collected so far, if any, and
        // move on past the delimiter so the scan always makes progress.
        if (!word.empty())
        {
            splitWords.push_back(word);
            word.clear();
        }
    }
    // The last word is not followed by a delimiter.
    if (!word.empty())
    {
        splitWords.push_back(word);
    }
    return splitWords;
}
bool isStopWord(string &word)
{
return stopWords.find(word) != nullptr;
}
/**
 * Returns a copy of word with the ASCII letters 'A'-'Z' converted to lower
 * case. Every other character is copied unchanged.
 *
 * @param word text to convert; not modified
 * @return lower-cased copy of word
 */
inline std::string toLower(const std::string &word)
{
    std::string lowerWord;
    lowerWord.reserve(word.size());
    for (char c : word)
    {
        // Only upper-case ASCII letters are shifted; digits, punctuation and
        // letters that are already lower case must stay as they are.
        if (c >= 'A' && c <= 'Z')
        {
            c = static_cast<char>(c + ('a' - 'A'));
        }
        lowerWord += c;
    }
    return lowerWord;
}
#endif //EECS398_SEARCH_STRINGPROCESSING_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment