Skip to content
Snippets Groups Projects
Commit 4fe3b4ee authored by aanvi's avatar aanvi
Browse files

Adding functions

parent 0755de6a
No related branches found
No related tags found
No related merge requests found
......@@ -6,11 +6,13 @@
// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
//
#include <string>
#include <functional>
#include <queue>
#include <iostream>
#include <fstream>
#include <stringProcessing.h>
using namespace std;
......@@ -88,6 +90,44 @@ private:
}
/*
* Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
* Instead of bool, just directly adds on to url queue
*/
void add_urls(string word)
{
string http_tag = "<a href=http";
auto word_iter = word.begin();
auto http_substr = http_tag.begin();
string url_name = "";
//will add all instances of wanted URLs until it hits end of string
while (word_iter != nullptr)
{
// sets word_iterator to next instance of URL from where it currently is
word_iter = findStr(word_iter, http_substr);
//in the case there is no url in the word
if (word_iter != nullptr)
{
//sets iterator to start of url content
word_iter += 12;
url_name = "http";
//end of url tag
auto tag_end = ("</a>", word);
while (word_iter != tag_end)
{
url_name += *word_iter;
++word_iter;
}
//sets iterator to one past the closing tag </a>
word_iter += 4;
URL_PQ.push(url_name);
//resets substr so it can look for next instance
http_substr = http_tag.begin();
}
}
}
/**
* Checks for url in string word
* @param word
......@@ -121,6 +161,43 @@ private:
* <title >AJF</title>
* @param word
*/
// Returns the concatenated contents of every <title>...</title> pair found in
// `word`, in order of appearance and with no separator between titles.
// An opening tag without a matching "</title>" contributes the rest of the
// string. Returns an empty string when `word` holds no "<title>" tag.
// Only the exact tag "<title>" is recognised (no attributes or extra spaces).
std::string check_title_handle(std::string &word)
{
    const std::string titleTag = "<title>";
    const std::string closeTitleTag = "</title>";

    std::string allTitles;
    std::string::size_type cursor = 0;
    while (true)
    {
        const std::string::size_type openPos = word.find(titleTag, cursor);
        if (openPos == std::string::npos)
        {
            break;
        }

        // first character past the opening tag
        const std::string::size_type textBegin = openPos + titleTag.size();
        const std::string::size_type closePos = word.find(closeTitleTag, textBegin);
        if (closePos == std::string::npos)
        {
            // unterminated title: keep everything up to the end of the text
            allTitles.append(word, textBegin, std::string::npos);
            break;
        }

        allTitles.append(word, textBegin, closePos - textBegin);
        // continue searching past the closing tag
        cursor = closePos + closeTitleTag.size();
    }
    return allTitles;
}
string check_title(string &word)
{
if (char* pos = strstr("<title>", word))
......
......@@ -8,18 +8,118 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <iterator>
using namespace std;
//TODO
//remove tag words
//assuming we have a buffer from File Handle
//returns pointer to subStr in original text
/*char* preFindStr(char* originalText, char* subStr)
{
bool isFound = true;
char* temp = originalText;
while ( isFound )
{
if ( *originalText == *subStr )
{
}
}
}*/
/*char* postFindStr(char* originalText, char* subStr)
{
bool isFound = true;
while ( originalText != nullptr )
{
if ( *originalText != *subStr )
{
++originalText;
}
if ( *originalText == *substr )
if (originalText == nullptr || substr == nullptr)
{
isFound = false;
}
else if ( *originalText != *subStr )
{
isFound = false;
}
else
{
isFound = true;
}
++subStr;
++originalText;
}
return originalText;
}*/
/*
 * Finds the first occurrence of subStr in the text that starts at
 * originalText and returns an iterator to the first character of the match.
 *
 * Only a start iterator is supplied, so the end of the text is detected by
 * the '\0' that std::string keeps after its last character. Consequences:
 *   - originalText must point into (or at the end of) a std::string;
 *   - text after an embedded '\0' is not searched.
 *
 * When there is no match, the returned iterator points at that terminator,
 * i.e. it equals end() of the searched string (absent embedded '\0').
 * An empty subStr matches immediately and originalText is returned.
 *
 * @param originalText  iterator to where the search starts
 * @param subStr        substring to look for
 * @return iterator to the start of the first match, or to the end of the text
 */
// `inline` because this definition lives in a header that may be included
// from more than one translation unit.
inline std::string::iterator findStr(std::string::iterator originalText, std::string &subStr)
{
    const std::string::iterator subBegin = subStr.begin();
    const std::string::iterator subEnd = subStr.end();

    std::string::iterator candidate = originalText;
    for ( ; *candidate != '\0'; ++candidate )
    {
        // Try to match the whole substring starting at `candidate`.
        std::string::iterator textPos = candidate;
        std::string::iterator subPos = subBegin;
        while ( subPos != subEnd && *textPos != '\0' && *textPos == *subPos )
        {
            ++textPos;
            ++subPos;
        }
        if ( subPos == subEnd )
        {
            // every character of subStr matched
            return candidate;
        }
        // Mismatch: advance by ONE character only, so that overlapping
        // candidates (e.g. "aab" inside "aaab") are not skipped.
    }
    // reached the terminator without finding a match
    return candidate;
}
// Common English words that carry no search value.
// Entries are lowercase with no surrounding whitespace, so a lookup must use
// a lowercased, trimmed token.
std::set<std::string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
"i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
"this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
vector<string> splitStr(string &originalText, char delim)
{
vector<string> splitWords;
auto begin = originalText.begin();
auto end = originalText.end();
......
test.cpp 0 → 100644
//
// Created by anvia on 2/6/2018.
//
#include <string>
#include <stringProcessing.h>
#include <iostream>
using namespace std;
// Smoke test for findStr: looks up "readable" in a sample paragraph and
// prints the character the returned iterator points at.
// Exits with status 1 if the substring is reported as not found.
int main()
{
    std::string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout."
    "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
    "making it look like readable English. ";
    std::string subStr = "readable";

    // findStr expects (iterator into the text, substring), in that order.
    auto iter = findStr(original.begin(), subStr);
    if (iter == original.end())
    {
        // never dereference the end iterator
        std::cerr << "substring not found" << '\n';
        return 1;
    }
    std::cout << *iter << '\n';
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment