diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36c2b8bfa55356ac88178bc4498a7e7991588996..1ba63b3f1cc6019f5949eb6110b7fb5013487e85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,9 +270,40 @@ add_executable(query-Ranker-tests
 
 add_executable(testQueryLang
         query/queryLanguage/tests/testQueryParser.cpp
         query/queryLanguage/QueryParser.cpp
-        util/
+        util/DataStructureLib/tuple.cpp
+        util/stringProcessing.cpp
+        util/Stemmer.cpp
+
+        )
+
+add_executable(testISRQueryTuple
+        query/queryLanguage/tests/queryIsrTest.cpp
+        query/queryLanguage/QueryParser.cpp
+
+        util/DataStructureLib/tuple.cpp
+        util/stringProcessing.cpp
+        util/Stemmer.cpp
+        util/util.cpp
+        constraintSolver/ISRContainer.cpp
+        constraintSolver/ISR.h
+        query/Ranker/Ranker.cpp
+        query/Ranker/Site.cpp
+        query/Ranker/Scorer.cpp
+        constraintSolver/ISREndDoc.cpp
+        constraintSolver/ISRWord.cpp
+        constraintSolver/ISRAnd.cpp
+        constraintSolver/ISROr.cpp
+        )
+
+add_executable(query-queryLanguage-tests
+        query/queryLanguage/QueryParser.cpp
+
+
+        )
+
+
 
 find_package(OpenSSL REQUIRED)
 
 target_link_libraries(TryPopTest OpenSSL::SSL)
@@ -287,3 +318,8 @@ target_link_libraries(crawler-parser-indexer-Test OpenSSL::SSL pthread)
 
 target_link_libraries(QueueTest pthread)
 
+
+
+
+
+
diff --git a/ISRAnd-tests b/ISRAnd-tests
index e73b68b74e1bb0323aed5e74f647dc1e03270e6c..6b17bd2c6a0e1f5fb75e902a87ece94cf48acd93 100755
Binary files a/ISRAnd-tests and b/ISRAnd-tests differ
diff --git a/constraintSolver/ISR.h b/constraintSolver/ISR.h
index 244a3ac775b29bf2b33c7b4b71e203aaaf836baa..b8999047a7ac7f9064991c18060c340a8db5ab55 100644
--- a/constraintSolver/ISR.h
+++ b/constraintSolver/ISR.h
@@ -19,6 +19,7 @@
 
 #define pathToIndex "/constraintSolver/index-test-files/twitter/"
 //#define pathToIndex "/buildIndex/"
+ //#define pathToIndex "/constraintSolver/index-test-files/twitter/"
 //#define pathToIndex "/build/"
 
 typedef size_t Location;    // Location 0 is the null location.
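The Location convention set up in ISR.h above (Location is a size_t, 0 is the null location) is what the loops later in this diff rely on: a reader advances until a sentinel location is returned. A minimal, self-contained sketch of that pattern, assuming a stand-in MAX_Location_sketch sentinel and an illustrative PostingCursor in place of the project's real ISRWord:

#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

typedef size_t Location;    // Location 0 is the null location.

// Stand-in for the project's MAX_Location sentinel (an assumption for this sketch).
const Location MAX_Location_sketch = std::numeric_limits< Location >::max( );

// Illustrative cursor over an in-memory posting list; it only mimics the
// GetCurrentLocation()/Next() shape used by the real ISRs in this diff.
struct PostingCursor
    {
    std::vector< Location > postings;
    size_t index = 0;

    Location GetCurrentLocation( ) const
        {
        return index < postings.size( ) ? postings[ index ] : MAX_Location_sketch;
        }
    void Next( )
        {
        ++index;
        }
    };

int main( )
    {
    PostingCursor cursor;
    cursor.postings = { 3, 17, 42 };
    // Same shape as the while( ...GetCurrentLocation() != MAX_Location ) loops below.
    while ( cursor.GetCurrentLocation( ) != MAX_Location_sketch )
        {
        std::cout << cursor.GetCurrentLocation( ) << "\n";
        cursor.Next( );
        }
    }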
diff --git a/constraintSolver/ISRContainer.cpp b/constraintSolver/ISRContainer.cpp
index 72c8a1a68957c6703aa53b15aff4b19c5ec9ccdd..8b00148ebf4931e2b1b1060894e4ab39ecd0cf80 100644
--- a/constraintSolver/ISRContainer.cpp
+++ b/constraintSolver/ISRContainer.cpp
@@ -3,3 +3,72 @@
 //
 
 #include "ISRContainer.h"
+
+
+ISRContainer::ISRContainer( Tuple * top ) : root( top )
+    {
+
+    compile( );
+    }
+
+
+
+void ISRContainer::compile( )
+    {
+
+
+    Contained = recurviseCompile( root );
+
+
+    }
+
+ISR * ISRContainer::recurviseCompile( Tuple * root )
+    {
+    vector< ISR * > terms;
+    if( root->Type == WordTupleType )
+        return new ISRWord( root->object.text );
+
+    else
+        {
+        for( auto child : root->Next )
+            terms.push_back( recurviseCompile( child ) );
+        }
+
+    if( root->Type == AndTupleType )
+        return new ISRAnd ( terms );
+    else
+        return new ISROr ( terms );
+
+    }
+
+void ISRContainer::Solve( )
+    {
+    while(Contained->GetCurrentLocation() != MAX_Location)
+        {
+        auto url = Contained->GetEndDocument()->getCurrentDoc().url;
+        cout << url << endl;
+
+        Contained->NextDocument( );
+
+
+
+/*
+ * beg = GetBeginning of Doc
+ * Pass Terms to ranker
+ *
+ * vector<words>
+ *
+ * Ranker:
+ *     for term in terms
+ *         IsrWord word = new ISR(term)
+ *         Term.seek(beg)
+ *         words.push(word)
+ *     rank(words)
+ *
+ *     NextDocument()
+ */
+        }
+
+
+    }
+
diff --git a/constraintSolver/ISRContainer.h b/constraintSolver/ISRContainer.h
index 9f8821869acd0d974050b020ef204ba4209012bf..1c889d675b914d6918318490cafa3f1acccca613 100644
--- a/constraintSolver/ISRContainer.h
+++ b/constraintSolver/ISRContainer.h
@@ -6,7 +6,10 @@
 
 #include "ISR.h"
 #include "ISREndDoc.h"
+#include "ISRAnd.h"
+#include "ISROr.h"
 #include "../query/Ranker/Ranker.h"
+#include "../util/DataStructureLib/tuple.cpp"
 
 
 //Find occurrences of contained ISRs in a single document not containing any excluded ISRs.
@@ -16,10 +19,16 @@ public:
     ISR *Contained;
     ISR *Excluded;
     vector<string> terms;
+    Tuple* root;
+
+    ISRContainer( Tuple * tuple_in );
+    ISR * recurviseCompile( Tuple * root );
 
     unsigned CountContained, CountExcluded;
 
+    void compile( );
+
     Location Seek ( Location target )
         {
@@ -36,7 +45,7 @@ public:
         // 7. If any excluded ISR falls within the document, reset the
         //    target to one past the end of the document and return to
         //    step 1.
-
+        return 1;
        };
 
 
@@ -46,31 +55,7 @@ public:
     * GetEndDocument
     *
     */
-    void Solve()
-        {
-
-        while(Contained->GetCurrentLocation() != MAX_Location)
-            {
-
-/*
- * beg = GetBeginning of Doc
- * Pass Terms to ranker
- *
- * vector<words>
- *
- * Ranker:
- *     for term in terms
- *         IsrWord word = new ISR(term)
- *         Term.seek(beg)
- *         words.push(word)
- *     rank(words)
- *
- *     NextDocument()
- */
-            }
-
-
-        }
+    void Solve( );
 
 
 private:
@@ -78,5 +63,3 @@ private:
     Location nearestStartLocation, nearestEndLocation;
     Ranker ranker;
     };
-
-};
\ No newline at end of file
diff --git a/constraintSolver/ISRWord.cpp b/constraintSolver/ISRWord.cpp
index 09e46368f92bf394d7b1b745c3f1fefc42026e5d..ac50f5f8fd55c0bebc3dcc6fe13b5c889512a2e6 100644
--- a/constraintSolver/ISRWord.cpp
+++ b/constraintSolver/ISRWord.cpp
@@ -176,7 +176,8 @@ void ISRWord::getWordSeek() {
 //check seek lookup table to find if offset+absulte is bigger than target
 //if so, set location to that big chunk
 //go to next chunk
-Location ISRWord::Seek( Location target ) {
+Location ISRWord::Seek( Location target )
+    {
     if(target <= currentLocation)
         return currentLocation;
 
@@ -210,6 +211,11 @@ Location ISRWord::Seek( Location target ) {
 
+
+//Returns the location of the last item in the document you're currently at
+Location ISRWord::GetEndDocumentLocation () const
+    {
+    return DocumentEnd->getCurrentDoc().docEndPosition;
+    }
 
 ISREndDoc * ISRWord::GetEndDocument()
     {
@@ -219,6 +225,7 @@ ISREndDoc * ISRWord::GetEndDocument()
 
 
+
 
 string ISRWord::GetTerm()
     {
diff --git a/constraintSolver/ISRWord.h b/constraintSolver/ISRWord.h
index a98114e77cbd0032a58046281c0f3ff642ae658f..ad9d9540ebe9a856dcadce6b05c4d6f57ad073e2 100644
--- a/constraintSolver/ISRWord.h
+++ b/constraintSolver/ISRWord.h
@@ -48,13 +48,14 @@ class ISRWord : public ISR
     size_t currentChunk;
     char *currentMemMap;
 
-    //set member variables to all of the chunks that occur, update current chunk
     void getChunks ( );
 
     Location getCurrentLocation();
-    size_t getFrequency();
-    size_t getDocFrequency();
-    size_t getLastLocation();
+    size_t getFrequency();
+    size_t getDocFrequency();
+    size_t getLastLocation();
+
+    Location GetEndDocumentLocation() const;
 
 private:
     void getWordSeek();
diff --git a/constraintSolver/tests/ISRWordMultiTest.cpp b/constraintSolver/tests/ISRWordMultiTest.cpp
index 4e94590b33e0201175171dfd73c3a0c9b784b6e1..43ef3432279fd34139344fae7e3df41ce954b2b4 100644
--- a/constraintSolver/tests/ISRWordMultiTest.cpp
+++ b/constraintSolver/tests/ISRWordMultiTest.cpp
@@ -68,10 +68,6 @@ int main ( )
         }
 
 
-
-
-
-
     int i = 0;
     for(auto output : urls)
         {
diff --git a/query/Ranker/Ranker.cpp b/query/Ranker/Ranker.cpp
index 4a35a524ba96f3ebdb7b402d3c131f02984d598c..cc21caebd902da867ea4041b59330646b3559d32 100644
--- a/query/Ranker/Ranker.cpp
+++ b/query/Ranker/Ranker.cpp
@@ -16,30 +16,121 @@
  * Initializes the ranker with the Word ISRs from the query, will most likely scale to add more input,
  * could possibly pull the information, depends on how much control we want
  */
-void Ranker::init ( vector<ISRWord> query )
+void Ranker::init ( vector<ISRWord> isrListInput )
     {
-    inputQuery = query;
+    ISRList = isrListInput;
     }
 
 /***
- * This will perform the "sorting" and return a list of ranked URLS
+ * This will generate a map of the Site objects for each document
  * @return
 */
-vector< string > Ranker::rank ( )
+void Ranker::generateSiteList( )
+    {
+    //Iterate through IsrWord vector and fill in Site information
+    for( auto isrWord = ISRList.begin(); isrWord < ISRList.end(); ++isrWord)
+        {
+        addWordtoSites( *isrWord );
+        }
+    }
+
+//Create a new site with attributes, or add word data to an existing site
+void Ranker::addWordtoSites ( ISRWord isrWord )
+    {
+
+    string word = isrWord.term;
+    auto url = isrWord.DocumentEnd->getCurrentDoc ( ).url;
+    if( Websites.find( url ) != Websites.end( ) )
+        {
+        Websites[ url ]->wordData[ word ] = getData( isrWord );
+
+
+        }
+    else
+        {
+        Site * newSite = new Site();
+        newSite->wordData[ word ] = getData( isrWord );
+        newSite->url = url;
+        Websites[ url ] = newSite;
+        }
+    }
+
+data Ranker::getData( ISRWord isrWord)
     {
     ISREndDoc endDocs;
-    vector<size_t> locations;
     vector<DocumentEnding> docEnds;
-    set<string> urls;
-    for( auto queryWord = inputQuery.begin(); queryWord < inputQuery.end(); ++queryWord)
+
+    unsigned long freq = 0;
+
+    //FIXME: just gets the word frequency, add more useful functions as we add heuristics
+
+    while ( isrWord.getCurrentLocation ( ) < isrWord.DocumentEnd->getCurrentDoc().docEndPosition)
+        {
+        isrWord.Next();
+        ++freq;
+        }
+
+    data wordData;
+    wordData.frequency = freq;
+
+    return wordData;
+    }
+
+Ranker::~Ranker()
+    {
+    for( auto i = Websites.begin( ); i != Websites.end( ); ++i )
         {
-        while ( queryWord->getCurrentLocation ( ) != MAX_Location )
+        delete i->second;
+        }
+    }
+
+/***
+ * Pushes all of the sites onto the priorityQueue, and scores them before pushing them on,
+ * which puts them in their ranked order
+ */
+void Ranker::rank()
+    {
+    for( auto i = Websites.begin(); i != Websites.end( ); ++i)
+        {
+        i->second->getScore();
+        WebsiteQueue.push( i->second );
+        }
+    }
+
+void Ranker::printSites()
+    {
+    for( auto i = Websites.begin(); i != Websites.end(); ++i )
+        {
+        cout << "URL: " << i->second->url << std::endl;
+
+        for( auto j = Websites[ i->second->url ]->wordData.begin( ); j != Websites[ i->second->url ]->wordData.end( ); ++j)
            {
-            auto url = queryWord->DocumentEnd->getCurrentDoc ( ).url;
-            urls.insert ( url );
-            queryWord->NextDocument ( );
+            cout << j->first << ": " << j->second.frequency << std::endl;
+            }
+        }
+    cout << "\n\n\n";
+    }
+
+void Ranker::printRankedSites()
+    {
+    cout << "----RANKED RESULTS----" << endl;
+    int size = WebsiteQueue.size();
+    for( auto i = size; i > 0; --i )
+        {
+        Site * website = WebsiteQueue.top();
+        WebsiteQueue.pop();
+        cout << "URL: " << website->url << std::endl;
+
+        for( auto j = Websites[ website->url ]->wordData.begin( ); j != Websites[ website->url ]->wordData.end( ); ++j)
+            {
+            cout << j->first << ": " << j->second.frequency << std::endl;
            }
        }
-    }
\ No newline at end of file
+    }
+
+//    vector<size_t> locations;
+//    set<string> urls;
+//    urls.insert ( url );
+
diff --git a/query/Ranker/Ranker.h b/query/Ranker/Ranker.h
index c1bd00523f6fe09b79e713834fa9f46113de8507..183641df6f8fc49b8cbf39ae73ae797b56027565 100644
--- a/query/Ranker/Ranker.h
+++ b/query/Ranker/Ranker.h
@@ -10,6 +10,7 @@
 #include <queue>
 #include "../../constraintSolver/ISRWord.h"
 #include "Site.h"
+#include <unordered_map>
 
 /***
 * Custom Comparator for the priority queue that keeps the websites in their correct order.
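The rank() and printRankedSites() functions added above push Site pointers onto a priority queue and then pop them to get results in ranked order; the comparator change in the next hunk switches the queue to Site* and to less-than on score, which makes the queue a max-heap. A small, self-contained sketch of that idea, using a simplified SiteSketch (just a url and a precomputed score) rather than the project's real Site class:

#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Simplified stand-in for Site (illustration only; the real class lives in Site.h).
struct SiteSketch
    {
    std::string url;
    double score;
    };

// "Less by score" over pointers gives a max-heap, so top() is the best result.
struct CompSketch
    {
    bool operator()( const SiteSketch *l, const SiteSketch *r ) const
        {
        return l->score < r->score;
        }
    };

int main( )
    {
    std::vector< SiteSketch > sites = { { "a.com", 2.0 }, { "b.com", 5.0 }, { "c.com", 1.0 } };

    std::priority_queue< SiteSketch *, std::vector< SiteSketch * >, CompSketch > ranked;
    for ( SiteSketch &site : sites )      // analogous to Ranker::rank( )
        ranked.push( &site );

    while ( !ranked.empty( ) )            // analogous to Ranker::printRankedSites( )
        {
        std::cout << ranked.top( )->url << " " << ranked.top( )->score << "\n";
        ranked.pop( );
        }
    }

This prints b.com, then a.com, then c.com. Popping drains the queue, which matches printRankedSites( ) looping WebsiteQueue.size( ) times.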
@@ -17,9 +18,9 @@
 class Comp
     {
 public:
-    bool operator()(Site L, Site R)
+    bool operator()(Site* L, Site* R)
        {
-        return L.getScore() > R.getScore();
+        return L->getScore() < R->getScore();
        }
     };
 
@@ -29,22 +30,43 @@ public:
     Ranker(){};
 
-    Ranker( vector< ISRWord > query ){
-        init( query );
+    Ranker( vector< ISRWord > isrListInput ){
+        init( isrListInput );
        };
 
-    vector< string > rank();
+
+    ~Ranker( );
+
+    void generateSiteList();
+    void printSites();
+    void printRankedSites();
+    void rank();
+
 
 private:
     void init( vector< ISRWord> query );
 
     //Queue to continuously sort the sites
-    priority_queue< Site, vector< Site> , Comp > WebsiteQueue;
-    vector< ISRWord > inputQuery;
+    priority_queue< Site * , vector< Site* > , Comp > WebsiteQueue;
+    vector< ISRWord > ISRList;
 
     //TODO: Not sure if we will need these
     vector< string > urls;
-    vector< Site > Websites;
+    unordered_map<string, Site * > Websites;
+    unordered_map< string , vector< unsigned long > > queryOffsets;
 
+    void addWordtoSites( ISRWord isrWord);
+
+    data getData( ISRWord isrWord );
+
+
+    /***
+     * Ranker will work by doing these things:
+     * 1. Goes through all of the ISRWords in the vector and creates Sites for the new ones and adds information
+     *    - If the Site already exists, just add the word data and attributes to it
+     *    - Store the frequency of the word in the document
+     * 2. Go through the vector of Sites and score each, then push onto the priority queue as you score them
+     *
+     */
     };
diff --git a/query/Ranker/Scorer.cpp b/query/Ranker/Scorer.cpp
index 6a9879d6ac3cc9934333e509b297d9313684ffc5..0b48889d1f3ca06835cd2d14ac6358e80028cf20 100644
--- a/query/Ranker/Scorer.cpp
+++ b/query/Ranker/Scorer.cpp
@@ -21,7 +21,12 @@ double Scorer::getScore ( Site website)
     return score / (double)numberOfFunctions;
     }
 
-double Scorer::Simple( Site )
+double Scorer::Simple( Site inputSite)
     {
-    return 42.0;
+    double score = 0;
+    for( auto i = inputSite.wordData.begin(); i != inputSite.wordData.end(); ++i )
+        {
+        score+=i->second.frequency;
+        }
+    return score;
     }
\ No newline at end of file
diff --git a/query/Ranker/Site.h b/query/Ranker/Site.h
index 95b57dce9ead070566fdebe30688ba452738d4bb..124796f4865957e5c51d00b0a794f11cfa83e4d3 100644
--- a/query/Ranker/Site.h
+++ b/query/Ranker/Site.h
@@ -8,6 +8,14 @@
 #define EECS398_SEARCH_SITE_H
 
 #include <string>
+#include <unordered_map>
+
+
+struct data
+    {
+    unsigned long frequency;
+    };
+
 class Site
     {
 public:
@@ -16,5 +24,6 @@ public:
     std::string url;
     double score;
     bool hasBeenScored;
+    std::unordered_map< std::string, data> wordData;
     };
 #endif //EECS398_SEARCH_SITE_H
diff --git a/query/Ranker/tests/RankerTest.cpp b/query/Ranker/tests/RankerTest.cpp
index 29588f9e982767937aa61d9f53217a0e7d4626f7..63d388aad8f3c68e74a7c86243879757ccfd6a11 100644
--- a/query/Ranker/tests/RankerTest.cpp
+++ b/query/Ranker/tests/RankerTest.cpp
@@ -19,20 +19,47 @@ int main()
     }
 
 void testSimple()
     {
     //Initialize Ran
-    ISRWord queryWord("%trump");
-    ISREndDoc endDocs;
-    vector<size_t> locations;
-    vector<DocumentEnding> docEnds;
+    string query = "%everybodi";
+    ISRWord queryWord(query);
     set<string> urls;
-
-    while( queryWord.getCurrentLocation() != MAX_Location )
-    {
+    clock_t start = clock();
+    while(queryWord.getCurrentLocation() != MAX_Location) {
        auto url = queryWord.DocumentEnd->getCurrentDoc().url;
-        urls.insert( url );
+        urls.insert( url );
        queryWord.NextDocument();
-    }
+        }
+    clock_t end = clock();
+
+    cout << "Time to complete query: " << (end - start) / (double) CLOCKS_PER_SEC << endl;
+    for(auto url :urls)
+        cout << url << endl;
+
+    cout << "Number of results: " << urls.size() << "\n\n\n";
"Number of results: " << urls.size() << "\n\n\n"; + ISRWord queryWord1(query); + ISRWord queryWord2(query); + queryWord2.NextDocument(); + ISRWord queryWord3(query); + queryWord3.NextDocument(); + + ISRWord queryWord4(query); + queryWord4.NextDocument(); + queryWord4.NextDocument(); + + + vector< ISRWord > wordList; + wordList.push_back( queryWord1 ); + wordList.push_back( queryWord2 ); + wordList.push_back( queryWord3 ); + wordList.push_back( queryWord4 ); + + + Ranker rankeyboi = Ranker( wordList ); + rankeyboi.generateSiteList(); + rankeyboi.printSites(); + rankeyboi.rank(); + rankeyboi.printRankedSites(); + } diff --git a/query/queryLanguage/QueryParser.cpp b/query/queryLanguage/QueryParser.cpp index e42ee1c30e9886cd78d6cd052e2bedd03eda93d8..a89c5fcf6f37197bb7029de22c0dbcd019808b5c 100644 --- a/query/queryLanguage/QueryParser.cpp +++ b/query/queryLanguage/QueryParser.cpp @@ -2,13 +2,18 @@ // Created by Zane Dunnings on 3/16/18. // -//Outline of query language from Prof. Nicole Hamilton, University of Michigan 03/15/2018 -// 72 lines #include "QueryParser.h" #include<unordered_set> #include "../../util/stringProcessing.h" #include<iostream> - +/*** + * QUERY PARSER CLASS + * + * 1. Constraint() - CAll this at the highest level, will split on ORs if there are ORs at the highest level, + * Will split on AND if theres an AND at the highest level. b + * + * + */ /*** * Returns a token of the next word in the query, past the given index * @param index @@ -171,9 +176,9 @@ bool QueryParser::MatchAND( string input ) ORMatch.insert("AND"); ORMatch.insert("&"); ORMatch.insert("&&"); -3ww ORMatch.insert("and"); + ORMatch.insert("and"); - if( 3ORMatch.count( input ) > 0 ) + if( ORMatch.count( input ) > 0 ) { return true; } @@ -195,6 +200,8 @@ Tuple* QueryParser::Constraint( string input ) t->Type = OrTupleType; else t->Type = AndTupleType; + Tuple* toBeKilled = constraintList[ 0 ]; + constraintList = breakOnAND ( input ); t->Next = constraintList; //Iterate through the subcontraints and if there are ORs, then run this again, else split on and for each @@ -255,33 +262,6 @@ vector<Tuple * > QueryParser::breakOnOR( string input ) if( query[ i ] == "(") { ++depth; -// ++depth; -// ++i; -// start = i; -// while(depth != 0 && ( i < query.size()) ) -// { -// if (query[ i ] == "(") -// ++depth; -// else if (query[ i ] == ")") -// --depth; -// if (depth == 0) -// { -// --i; -// break; -// } -// else -// ++i; -// } -// if (i == query.size()) -// i = query.size() - 1; -// string text = ""; -// for ( int j = start; j < i; ++ j) -// { -// text+= query[ j ]; -// } -// Tuple * t = new Tuple( text ); -// constraintList.push_back( t ); -// t->Type = AndTupleType; } else if( query[ i ] == ")") { diff --git a/query/queryLanguage/tests/queryIsrTest.cpp b/query/queryLanguage/tests/queryIsrTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed1a256ec4b9940cfd54b1bb87535a2131470fd9 --- /dev/null +++ b/query/queryLanguage/tests/queryIsrTest.cpp @@ -0,0 +1,41 @@ +// +// Created by Jake Close on 4/7/18. 
+//
+
+
+#include "../QueryParser.h"
+#include<iostream>
+#include <fstream>
+#include "../../../constraintSolver/ISRContainer.h"
+
+int main()
+    {
+    string query = "moment life";
+    string OR = "bike cycle ";
+    QueryParser parser;
+    parser.parse( query );
+
+    Token orParentToken = Token("-OR-");
+    Tuple * orparent = new Tuple( orParentToken );
+
+    Token bike = Token("bike");
+    Token cycle = Token("cycle");
+
+    Tuple* bikeTuple = new Tuple( bike );
+    Tuple* cycleTuple = new Tuple( cycle );
+
+    orparent->Next.push_back(bikeTuple);
+    orparent->Next.push_back( cycleTuple);
+
+
+    parser.printCompiledQuery();
+
+
+    //parser.queryTree->Next.push_back( orparent );
+
+
+
+    ISRContainer container ( parser.queryTree );
+
+    container.Solve( );
+    }
\ No newline at end of file
diff --git a/testISRQueryTuple b/testISRQueryTuple
new file mode 100755
index 0000000000000000000000000000000000000000..7903f40157e615a83cb2b052a8ff2a44057dd35d
Binary files /dev/null and b/testISRQueryTuple differ
diff --git a/testQueryLang b/testQueryLang
new file mode 100755
index 0000000000000000000000000000000000000000..eb64861fcb55cb5a559470c3efdf6b340ca3cc63
Binary files /dev/null and b/testQueryLang differ
diff --git a/util/DataStructureLib/tuple.cpp b/util/DataStructureLib/tuple.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b95ab4e9b9297a20d8b9238c99c89f5d6e5e9f3
--- /dev/null
+++ b/util/DataStructureLib/tuple.cpp
@@ -0,0 +1,79 @@
+//
+// Created by Zane Dunnings on 3/17/18.
+//
+
+// Outline of query language from Prof. Nicole Hamilton, University of Michigan 03/15/2018
+//31 lines
+#pragma once
+#include<string>
+#include<vector>
+#include "../../parser/Parser.h"
+//#include "../../constraintSolver/ISRAnd.h"
+using namespace std;
+
+
+class Token
+    {
+public:
+    Token()
+        :text(""), end( true ), OR( false ), AND( false ){}
+    Token( string input )
+        :text( input ), end( false ), OR( false ), AND( false )
+        {
+        if(input == "-OR-" )
+            OR = true;
+        else if(input == "-AND-")
+            AND=true;
+        }
+    //TODO: This is for scaling to add more ISR types
+    string text;
+    bool OR;
+    bool AND;
+    bool end;
+    };
+
+enum TupleType
+    {
+    PhraseTupleType,
+    OrTupleType,
+    AndTupleType,
+    NotTupleType,
+    WordTupleType
+    };
+
+class Tuple
+    {
+public:
+
+    Token object;
+    vector<Tuple *> Next;
+    TupleType Type;
+    //ISR *Compile( );
+    Tuple( )
+        : object( Token() ), Type( AndTupleType ) {}
+    Tuple( Token input )
+        : object( input ), Type( AndTupleType )
+        {
+        if(input.AND)
+            Type = AndTupleType;
+        else if (input.OR)
+            Type = OrTupleType;
+        else
+            Type = WordTupleType;
+        }
+    Tuple( TupleType type)
+        : object( Token( ) ), Type( type )
+        {
+        switch( type )
+            {
+            case( AndTupleType ):
+                object = Token("-AND-");
+                break;
+            case( OrTupleType ):
+                object = Token("-OR-");
+            default:
+                break;
+            }
+        }
+
+    };
\ No newline at end of file
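To tie the pieces of this diff together, here is a self-contained sketch of the query tree that tuple.cpp defines and of the post-order walk that ISRContainer::recurviseCompile performs on it. The Token and Tuple types below are trimmed copies of the ones above (the Parser.h include is dropped), and printCompile is only an illustrative stand-in for building ISRWord/ISRAnd/ISROr nodes; it prints the plan instead, so the sketch runs without an index.

#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Trimmed Token/Tuple, matching the shape defined in util/DataStructureLib/tuple.cpp above.
class Token
    {
public:
    string text;
    bool OR, AND;
    Token( string input )
        : text( input ), OR( input == "-OR-" ), AND( input == "-AND-" ) { }
    };

enum TupleType { OrTupleType, AndTupleType, WordTupleType };

class Tuple
    {
public:
    Token object;
    vector< Tuple * > Next;
    TupleType Type;
    Tuple( Token input ) : object( input ), Type( WordTupleType )
        {
        if ( input.AND )
            Type = AndTupleType;
        else if ( input.OR )
            Type = OrTupleType;
        }
    };

// Stand-in for recurviseCompile: same recursion, but it prints the ISR it
// would build at each node instead of constructing ISRWord/ISRAnd/ISROr.
void printCompile( const Tuple *node, int depth )
    {
    string indent( depth * 2, ' ' );
    if ( node->Type == WordTupleType )
        {
        cout << indent << "ISRWord( \"" << node->object.text << "\" )\n";
        return;
        }
    cout << indent << ( node->Type == AndTupleType ? "ISRAnd" : "ISROr" ) << "\n";
    for ( const Tuple *child : node->Next )
        printCompile( child, depth + 1 );
    }

int main( )
    {
    // Hand-built tree for "moment AND ( bike OR cycle )", mirroring the
    // orparent / bikeTuple / cycleTuple construction in queryIsrTest.cpp.
    // (Allocations are not freed; leaks are ignored in this sketch, as in the test above.)
    Tuple *root = new Tuple( Token( "-AND-" ) );
    Tuple *orNode = new Tuple( Token( "-OR-" ) );
    orNode->Next.push_back( new Tuple( Token( "bike" ) ) );
    orNode->Next.push_back( new Tuple( Token( "cycle" ) ) );
    root->Next.push_back( new Tuple( Token( "moment" ) ) );
    root->Next.push_back( orNode );

    printCompile( root, 0 );   // prints the ISRAnd / ISROr / ISRWord plan
    }

Once the tree is compiled for real, ISRContainer::Solve( ) walks matching documents with GetCurrentLocation( ) and NextDocument( ) and, per the comment block in ISRContainer.cpp, hands the terms for each document to the Ranker shown earlier in this diff.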