From 4fe3b4ee8ba5e26035becc94fbbb1b6008e498db Mon Sep 17 00:00:00 2001
From: aanvi <aanvi@umich.edu>
Date: Tue, 13 Feb 2018 18:18:32 -0500
Subject: [PATCH] Add findStr helper, add_urls and check_title_handle

---
 Parser.cpp         |  77 ++++++++++++++++++++++++++++++++++
 stringProcessing.h | 100 +++++++++++++++++++++++++++++++++++++++++++++
 test.cpp           |  20 +++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 test.cpp

diff --git a/Parser.cpp b/Parser.cpp
index 3aaf3e1..e4669fe 100644
--- a/Parser.cpp
+++ b/Parser.cpp
@@ -6,11 +6,13 @@
 // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
 //
 
+
 #include <string>
 #include <functional>
 #include <queue>
 #include <iostream>
 #include <fstream>
+#include <stringProcessing.h>
 
 using namespace std;
 
@@ -88,6 +90,44 @@ private:
 
 	 }
 
+    /*
+     * Scans word for anchor tags of the form <a href=http...>...</a> and pushes
+     * one entry onto URL_PQ per terminated tag. Returns nothing; matches go
+     * straight onto the url queue.
+     *
+     * Each entry starts at the "http" that closes the opening pattern and runs up
+     * to, but not including, the next "</a>".
+     * NOTE(review): that span still contains the rest of the opening tag and the
+     * link text (e.g. "http://a.b>label"); trim at '>' if only the address is
+     * wanted -- confirm what the URL_PQ consumer expects.
+     *
+     * An opening pattern with no closing "</a>" after it ends the scan without
+     * pushing anything.
+     *
+     * @param word text to scan (taken by value; the caller's string is untouched)
+     */
+    void add_urls(string word)
+    {
+        const string http_tag = "<a href=http";
+        const string close_tag = "</a>";
+        //length of the "http" suffix of http_tag, which belongs to the url itself
+        const size_t scheme_len = 4;
+
+        //string::find reports "no match" as npos, so every step is bounds-checked
+        size_t tag_pos = word.find(http_tag);
+        while (tag_pos != string::npos)
+        {
+            const size_t url_begin = tag_pos + http_tag.size() - scheme_len;
+            const size_t url_end = word.find(close_tag, url_begin);
+            if (url_end == string::npos)
+            {
+                break;
+            }
+            URL_PQ.push(word.substr(url_begin, url_end - url_begin));
+            //resume one past the closing tag so the next anchor can be found
+            tag_pos = word.find(http_tag, url_end + close_tag.size());
+        }
+    }
 	 /**
 	  * Checks for url in string word
 	  * @param word
@@ -121,6 +161,43 @@ private:
 	  * <title >AJF</title>
 	  * @param word
 	  */
+
+     //Collects the text of every <title>...</title> pair found in word.
+     //Contents are appended back to back, in order, with no separator added.
+     //A <title> with no closing tag contributes everything up to the end of word.
+     //Only the exact tag "<title>" is recognised; returns "" when there is none.
+     //Offsets are used instead of iterators so no step can run past the end of word.
+     //@param word text to scan; it is only read, never modified
+     //@return concatenation of all title contents
+     string check_title_handle(string &word)
+     {
+         const string titleTag = "<title>";
+         const string closeTitleTag = "</title>";
+
+         string allTitles;
+         //string::find returns npos when nothing is left, which ends the scan
+         size_t openPos = word.find(titleTag);
+         while (openPos != string::npos)
+         {
+             //first character past the opening tag
+             const size_t textBegin = openPos + titleTag.size();
+             const size_t textEnd = word.find(closeTitleTag, textBegin);
+
+             if (textEnd == string::npos)
+             {
+                 //unterminated title: keep the remainder of the text
+                 allTitles.append(word, textBegin, string::npos);
+                 break;
+             }
+
+             allTitles.append(word, textBegin, textEnd - textBegin);
+
+             //continue the search just past the closing tag
+             openPos = word.find(titleTag, textEnd + closeTitleTag.size());
+         }
+
+         return allTitles;
+     }
 	 string check_title(string &word)
 	 {
 		 if (char* pos = strstr("<title>", word))
diff --git a/stringProcessing.h b/stringProcessing.h
index 239b15b..bc830b6 100644
--- a/stringProcessing.h
+++ b/stringProcessing.h
@@ -8,18 +8,118 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <iterator>
 
 using namespace std;
 
 //TODO
 //remove tag words
 
+//assuming we have a buffer from File Handle
+//returns pointer to subStr in original text
+/*char* preFindStr(char* originalText, char* subStr)
+{
+    bool isFound = true;
+    char* temp = originalText;
+    while ( isFound )
+    {
+        if ( *originalText == *subStr )
+        {
+
+        }
+    }
+}*/
+/*char* postFindStr(char* originalText, char* subStr)
+{
+    bool isFound = true;
+    while ( originalText != nullptr )
+    {
+        if ( *originalText != *subStr )
+        {
+            ++originalText;
+        }
+
+        if ( *originalText == *substr )
+        if (originalText == nullptr || substr == nullptr)
+        {
+            isFound = false;
+        }
+
+        else if ( *originalText != *subStr )
+        {
+            isFound = false;
+        }
+
+        else
+        {
+            isFound = true;
+        }
+
+        ++subStr;
+        ++originalText;
+    }
+
+    return originalText;
+
+}*/
+
+
+/*
+ * Finds the first occurrence of subStr at or after originalText.
+ *
+ * @param originalText iterator into a std::string; the search starts here
+ * @param subStr       pattern to look for (an empty pattern matches at once)
+ * @return iterator to the first character of the match; when there is no
+ *         match, the position of the text's terminating '\0', which equals
+ *         end() of the owning string unless the text holds embedded '\0's
+ *
+ * The signature carries no end iterator, so the scan stops at the '\0' that
+ * std::string keeps after its last character (guaranteed since C++11).
+ * Iterators cannot be compared with nullptr, so callers must test the result
+ * against end() of their string.
+ * NOTE(review): originalText is dereferenced to reach the character buffer,
+ * so callers should pass a position before end(). When the whole string is
+ * available, std::string::find is the simpler tool.
+ *
+ * Declared inline because it is defined in a header that more than one
+ * source file includes.
+ */
+inline string::iterator findStr(string::iterator originalText, const string &subStr)
+{
+    //raw pointers let the scan read the terminator without stepping past end()
+    const char *const text_begin = &*originalText;
+    const char *candidate = text_begin;
+
+    while ( true )
+    {
+        //try to match the whole pattern starting at candidate
+        const char *text_pos = candidate;
+        auto sub_pos = subStr.begin();
+        while ( sub_pos != subStr.end() && *text_pos != '\0' && *text_pos == *sub_pos )
+        {
+            ++text_pos;
+            ++sub_pos;
+        }
+
+        //pattern exhausted: candidate is the start of an exact match
+        //terminator reached: no match, hand back the end position
+        if ( sub_pos == subStr.end() || *candidate == '\0' )
+        {
+            return originalText + ( candidate - text_begin );
+        }
+
+        /* advance by one character only; resuming where the failed attempt
+           stopped would skip overlapping matches such as "aab" in "aaab" */
+        ++candidate;
+    }
+}
 set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
                          "i", "in", "is", "it", "its", "many ","me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
                          "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
 
 vector<string> splitStr(string &originalText, char delim)
 {
+
     vector<string> splitWords;
     auto begin = originalText.begin();
     auto end = originalText.end();
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..155d22d
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,20 @@
+//
+// Created by anvia on 2/6/2018.
+//
+
+#include <string>
+#include <stringProcessing.h>
+#include <iostream>
+using namespace std;
+
+int main()
+{
+    string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout."
+            "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
+            "making it look like readable English. ";
+    string subStr = "readable";
+    //findStr takes (iterator into the text, pattern); the old call passed two strings in the wrong order
+    auto iter = findStr(original.begin(), subStr);
+    cout << (iter == original.end() ? string("not found") : string(iter, iter + subStr.size())) << endl;
+}
+
-- 
GitLab