Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eecs398-search
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
vcday
eecs398-search
Commits
eb55e2f3
Commit
eb55e2f3
authored
7 years ago
by
vcday
Browse files
Options
Downloads
Patches
Plain Diff
parser changes
parent
fbbf2bf0
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Parser.h
+155
-0
155 additions, 0 deletions
Parser.h
Tokenizer.h
+0
-0
0 additions, 0 deletions
Tokenizer.h
with
155 additions
and
0 deletions
Parser.
cpp
→
Parser.
h
+
155
−
0
View file @
eb55e2f3
...
...
@@ -12,7 +12,8 @@
#include
<queue>
#include
<iostream>
#include
<fstream>
#include
<stringProcessing.h>
#include
"Tokenizer.h"
#include
"stringProcessing.h"
using
namespace
std
;
...
...
@@ -49,15 +50,13 @@ public:
*/
// input: object with char* and URL string
//
string
execute
()
const
unordered_map
<
string
,
vector
<
int
>>
execute
()
{
Tokenizer
tokenizer
()
;
Tokenizer
tokenizer
;
//TEMP - until we get real input from crawler
raw_data
data
(
"url"
,
"html"
);
parse
(
data
.
html_data
,
&
tokenizer
);
return
tokenzier
.
get
();
return
tokenizer
.
get
();
}
...
...
@@ -72,22 +71,17 @@ private:
string
parse
(
string
&
html_data
,
Tokenizer
*
tokenizer
)
{
//figure out file handle syntax - pointer to file
tokenizerInput
=
""
;
currentTerm
=
""
;
string
tokenizerInput
=
""
;
string
currentTerm
=
""
;
for
(
int
i
=
0
;
i
<
html_data
.
size
();
++
i
)
{
while
(
html_data
[
i
]
!=
' '
)
{
currentTerm
+=
html_data
[
i
];
}
//one method that directly adds urls onto frontier instead of checking for them
if
(
!
check_title
(
currentTerm
))
{
add_urls
(
current_term
);
}
else
{
tokenizerInput
+=
currentTerm
;
//can also pass titles individually through tokenizer instead of concatenating
}
add_urls
(
currentTerm
);
check_title
(
currentTerm
);
tokenizerInput
+=
currentTerm
;
}
tokenizer
->
execute
(
tokenizerInput
);
...
...
@@ -104,12 +98,12 @@ private:
string
http_end_tag
=
">"
;
auto
word_iter
=
word
.
begin
();
url
=
""
;
string
url
=
""
;
word_iter
=
findStr
(
word_iter
,
a_tag
);
if
(
word_iter
)
{
if
(
word_iter
!=
nullptr
)
{
auto
found_http
=
findStr
(
word_iter
,
http_start
);
if
(
found_http
)
{
url
=
"http"
;
if
(
found_http
!=
nullptr
)
{
url
=
"http"
;
found_http
+=
9
;
auto
end_http
=
findStr
(
word_iter
,
http_end_tag
);
while
(
found_http
!=
end_http
)
{
...
...
@@ -129,36 +123,6 @@ private:
}
/**
* Checks for url in string word
* @param word
*/
bool
check_url
(
string
&
word
)
{
//need to add string processing function where you check in a specified range of positions
if
(
char
*
pos
=
strstr
(
"href"
,
word
))
{
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
++
pos
;
// take everything until next quote
string
url
=
""
;
++
pos
;
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
{
//filter out everything except http, https
url
+=
*
pos
;
}
// send it back to the crawler
URL_PQ
.
push
(
url
);
return
true
;
}
return
false
;
}
/**
* <title >AJF</title>
* @param word
...
...
@@ -166,10 +130,10 @@ private:
bool
check_title
(
string
&
word
)
{
/*
if (char* pos = strstr("<title>", word))
if
(
char
*
pos
=
strstr
(
"<title>"
,
word
))
{
pos
+=
6
;
end_pos = strstr("</title>", word);
auto
end_pos
=
strstr
(
"</title>"
,
word
);
string
title
=
""
;
while
(
pos
!=
end_pos
)
{
...
...
@@ -179,105 +143,13 @@ private:
}
return
title
;
}
*/
}
begin_title
=
"<title>"
;
auto
word_begin
=
word
.
begin
();
auto
word_iter
=
findStr
();
//
string
begin_title = "<title>";
//
auto word_begin = word.begin();
//
auto word_iter = findStr(
word_begin, begin_title
);
}
<<<<<<<
HEAD
/**
* Checks for url in string word
* @param word
*/
bool
check_url
(
string
&
word
)
{
if
(
char
*
pos
=
strstr
(
"href"
,
word
))
{
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
++
pos
;
// take everything until next quote
string
url
=
""
;
++
pos
;
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
{
//filter out everything except http, https
url
+=
*
pos
;
}
// send it back to the crawler
URL_PQ
.
push
(
url
);
return
true
;
}
return
false
;
}
/**
 * <title >AJF</title>
 * Collects the text of every complete <title>...</title> pair in word
 * and concatenates the contents into one string.
 * @param word document text to scan
 * @return concatenated contents of all complete title tags ( "" if none )
 *
 * NOTE(review): the original compared std::string iterators against
 * nullptr ( ill-formed — findStr's sentinel only makes sense for char* )
 * and could walk past the end of word on an unterminated title.
 * Rewritten with std::string::find so every scan is bounded; an
 * unterminated <title> is skipped rather than read past the end.
 */
std::string check_title_handle ( std::string & word )
   {
   const std::string titleTag = "<title>";
   const std::string closeTitleTag = "</title>";
   std::string allTitles = "";
   size_t searchFrom = 0;
   while ( true )
      {
      size_t openTag = word.find( titleTag, searchFrom );
      if ( openTag == std::string::npos )
         return allTitles;
      //increments until first letter past opening title tag
      openTag += titleTag.size( );
      size_t closeTag = word.find( closeTitleTag, openTag );
      // unterminated title: stop rather than running off the end
      if ( closeTag == std::string::npos )
         return allTitles;
      allTitles += word.substr( openTag, closeTag - openTag );
      //increments until first letter past closing title tag
      searchFrom = closeTag + closeTitleTag.size( );
      }
   }
/**
 * Returns the contents of the first complete <title>...</title> pair
 * in word.
 * @param word document text to scan
 * @return the title text, or "" when no complete title tag is present
 *
 * NOTE(review): the original passed strstr arguments in reversed order
 * ( and passed a std::string where char* is required ), used an
 * undeclared end_pos, advanced past "<title>" by 6 instead of 7, shifted
 * every copied character by one ( ++pos before the append, picking up
 * the '<' of the closing tag ), and fell off the end with no return
 * when nothing matched ( UB ).  Rewritten with std::string::find.
 */
std::string check_title ( std::string & word )
   {
   size_t openTag = word.find( "<title>" );
   if ( openTag == std::string::npos )
      return "";
   openTag += 7; // strlen( "<title>" )
   size_t closeTag = word.find( "</title>", openTag );
   if ( closeTag == std::string::npos )
      return "";
   std::string title = word.substr( openTag, closeTag - openTag );
   return title;
   }
//TODO
};
This diff is collapsed.
Click to expand it.
Tokenizer.
cpp
→
Tokenizer.
h
+
0
−
0
View file @
eb55e2f3
File moved
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment