Added tokenizer

0755de6a · aanvi · 1e1d50c5 · 0755de6a · 0755de6a · 0755de6a
Commit 0755de6a authored 7 years ago by aanvi
--- a/Parser.cpp
+++ b/Parser.cpp
@@ -2,6 +2,10 @@
 // Created by Veronica Day on 1/28/18.
 //

+// keep running count of offset; if stop word: don't increment and remove stopword
+// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
+//
+
 #include <string>
 #include <functional>
 #include <queue>
@@ -41,9 +45,10 @@ public:
 		if (!inFile)
 			cerr << "Unable to open file datafile.txt";

+        Tokenizer tokenizer();
+		parse(inFile, &tokenizer);

-		parse(inFile);
-
+        return tokenizer.get();



@@ -60,18 +65,25 @@ private:
 	  * @param inFile
 	  * @return
 	  */
-	 string parse(ifstream inFile)
+
+	 string parse(ifstream inFile, Tokenizer *tokenizer)
 	 {
+		 //figure out file handle syntax - pointer to file
 		 string word = "";
 		 while (!inFile.eof())
 		 {
 			 inFile >> word;

 			 // checks for url
-			 check_url(word);

-			 // checks for title tags
-			 Tokenizer.execute(check_title(word));
+			 if (!check_url(word))
+             {
+                 // checks for title tags
+                 tokenizer->execute(check_title(word));
+
+             }
+
+
 		 }

 	 }
@@ -80,7 +92,7 @@ private:
 	  * Checks for url in string word
 	  * @param word
 	  */
-	 void check_url(string &word)
+	 bool check_url(string &word)
 	 {
 		 if (char* pos = strstr("href", word))
 		 {
@@ -92,12 +104,17 @@ private:
 			 ++pos;
 			 while (pos != "\"" && pos != "\'")
 			 {
+				 //filter out everything except http, https
 				 url += *pos;
 			 }

 			 // send it back to the crawler
 			 URL_PQ.push(url);
+             return true;
+
 		 }
+
+         return false;
 	 }

 	 /**

--- a/Tokenizer.cpp
+++ b/Tokenizer.cpp
+//
+// Created by anvia on 1/31/2018.
+//
+
+#include <string>
+#include <stdio.h>
+#include <string.h>
+#include <unordered_map>
+#include <vector>
+#include <stringProcessing.h>
+using namespace std;
+
+/**
+ * Builds a per-document index mapping each lowercased, non-stop word to the
+ * list of offsets at which it occurs.
+ *
+ * Offsets count indexed words only: stop words and empty tokens are dropped
+ * and do not advance the offset. The count is kept across calls to execute(),
+ * so feeding a document one word (or one chunk) at a time still yields
+ * increasing positions instead of restarting at 0 on every call.
+ */
+class Tokenizer
+{
+public:
+    Tokenizer() {}
+
+    /**
+     * @return read-only reference to the index built so far; it stays valid
+     *         only as long as this Tokenizer is alive.
+     */
+    const unordered_map<string, vector<int>> & get() const
+    {
+        return doc_index;
+    }
+
+    /**
+     * Splits originalText on ' ', lowercases each token and records the
+     * offset of every token that is not a stop word.
+     * @param originalText raw text to add to the index
+     */
+    void execute(string originalText)
+    {
+        vector<string> splitText = splitStr(originalText, ' ');
+        for (size_t i = 0; i < splitText.size(); ++i)
+        {
+            // Consecutive delimiters can yield empty tokens; never index "".
+            if (splitText[i].empty())
+            {
+                continue;
+            }
+
+            string lowerString = toLower(splitText[i]);
+            if (!isStopWord(lowerString))
+            {
+                doc_index[lowerString].push_back(offset);
+                ++offset;
+            }
+        }
+    }
+
+private:
+    unordered_map<string, vector<int>> doc_index;
+    // Position of the next indexed word; persists across execute() calls.
+    int offset = 0;
+};
--- a/stringProcessing.h
+++ b/stringProcessing.h
+//
+// Created by anvia on 1/31/2018.
+//
+
+#ifndef EECS398_SEARCH_STRINGPROCESSING_H
+#define EECS398_SEARCH_STRINGPROCESSING_H
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using namespace std;
+
+//TODO
+//remove tag words
+
+// Lowercase English stop words that are excluded from the index.
+// `static const` gives every translation unit its own read-only copy, so
+// including this header from several .cpp files does not cause
+// duplicate-symbol link errors.
+static const set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
+                         "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
+                         "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
+
+/**
+ * Splits originalText into the pieces separated by delim.
+ *
+ * Empty pieces (from leading, trailing or consecutive delimiters) are
+ * dropped, so the result contains only non-empty words.
+ *
+ * @param originalText text to split; not modified
+ * @param delim        character that separates words
+ * @return the non-empty words in their original order
+ */
+inline vector<string> splitStr(const string &originalText, char delim)
+{
+    vector<string> splitWords;
+    string word;
+    for (char c : originalText)
+    {
+        if (c != delim)
+        {
+            word += c;
+        }
+        else if (!word.empty())
+        {
+            // Delimiter closes the current word.
+            splitWords.push_back(word);
+            word.clear();
+        }
+    }
+
+    // Flush the final word when the text does not end with a delimiter.
+    if (!word.empty())
+    {
+        splitWords.push_back(word);
+    }
+
+    return splitWords;
+}
+
+/**
+ * Reports whether word is in the stopWords set.
+ * The lookup is an exact match, so callers must lowercase the word first.
+ *
+ * @param word word to look up; not modified
+ * @return true if word is a stop word, false otherwise
+ */
+inline bool isStopWord(const string &word)
+{
+    // set::find returns end() (not a null pointer) when the key is absent.
+    return stopWords.find(word) != stopWords.end();
+}
+
+/**
+ * Returns a copy of word with ASCII uppercase letters ('A'-'Z') converted to
+ * lowercase; every other character is copied unchanged.
+ *
+ * @param word text to convert; not modified
+ * @return the lowercased copy
+ */
+inline string toLower(const string &word)
+{
+    string lowerWord;
+    lowerWord.reserve(word.size());
+    for (char c : word)
+    {
+        if (c >= 'A' && c <= 'Z')
+        {
+            // 'a' - 'A' is the fixed distance between the two ASCII cases.
+            lowerWord += static_cast<char>(c + ('a' - 'A'));
+        }
+        else
+        {
+            lowerWord += c;
+        }
+    }
+
+    return lowerWord;
+}
+
+#endif //EECS398_SEARCH_STRINGPROCESSING_H