diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e5b7541f369a769ce9a9d62ddfab60dcf48df9d..c6d356cc6b1e4c73c5865ed74935dd7584146d2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,17 @@ add_executable(ISRWord-Multi-Test util/stringProcessing.cpp util/Stemmer.cpp ) +add_executable(ISRMultiWordORTest + constraintSolver/tests/ISRMultiWordORTest.cpp + util/util.cpp + constraintSolver/ISR.cpp + constraintSolver/ISRWord.cpp + constraintSolver/ISRAnd.cpp + constraintSolver/ISREndDoc.cpp + util/stringProcessing.cpp + util/Stemmer.cpp ) + + add_executable(DataStructures-Vector-tests DataStructures/Vector/Vector.h @@ -241,6 +252,19 @@ add_executable(Corpus-tests util/util.cpp ) +add_executable(query-Ranker-tests + query/Ranker/tests/RankerTest.cpp + query/Ranker/Ranker.h + query/Ranker/Scorer.h + query/Ranker/Site.h + query/Ranker/Ranker.cpp + query/Ranker/Scorer.cpp + query/Ranker/Site.cpp + util/util.cpp + constraintSolver/ISRWord.cpp + constraintSolver/ISREndDoc.cpp + util/stringProcessing.cpp + util/Stemmer.cpp ) find_package(OpenSSL REQUIRED) @@ -255,3 +279,4 @@ target_link_libraries(crawler-parser-Test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-indexer-Test OpenSSL::SSL pthread) target_link_libraries(QueueTest pthread) + diff --git a/DataStructures/DiskHashTable/DiskHashTableTests.cpp b/DataStructures/DiskHashTable/DiskHashTableTests.cpp index e7d9a3db53892aab9e4e286727ba63460584617b..d56529879f47995d942d8517c313a82da0012e9e 100644 --- a/DataStructures/DiskHashTable/DiskHashTableTests.cpp +++ b/DataStructures/DiskHashTable/DiskHashTableTests.cpp @@ -22,43 +22,6 @@ int main() { DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test1.txt", 10, 8); vector<pair<string, string>> data; -// data.push_back({"sherlock", "holmes"}); -// data.push_back({"kendrick", "lamar"}); -// data.push_back({"hello", "goodbye"}); -// data.push_back({"moon", "landing"}); -// data.push_back({"barack", "obama"}); -// data.push_back({"katy", "perry"}); -// data.push_back({"anderson", "paak"}); -// data.push_back({"dunder", "mifflin"}); -// data.push_back({"university", "michigan"}); -// data.push_back({"abel", "tesfaye"}); -// data.push_back({"vince", "staples"}); -// data.push_back({"danny", "brown"}); -// data.push_back({"ann", "arbor"}); -// data.push_back({"tame", "impala"}); -// data.push_back({"machine", "learning"}); -// data.push_back({"north", "face"}); -// data.push_back({"eecs", "398"}); -// data.push_back({"intel", "corei7"}); -// data.push_back({"constraint", "solver"}); -// data.push_back({"multi", "threaded"}); -// data.push_back({"march", "madness"}); -// data.push_back({"sister", "nation"}); -// data.push_back({"daft", "punk"}); -// data.push_back({"the god", "anddevil"}); -// data.push_back({"are raging", "insideme"}); -// data.push_back({"hiatus", "kaiyote"}); -// data.push_back({"jai", "wolf"}); -// data.push_back({"griz", "psgfy"}); -// data.push_back({"stack", "overflow"}); -// data.push_back({"carpenter", "brut"}); -// data.push_back({"harry", "potter"}); -// data.push_back({"fall out", "boy"}); -// data.push_back({"red hot", "chili"}); -// data.push_back({"after", "laughter"}); -// data.push_back({"carly rae", "jepsen"}); -// data.push_back({"lana del", "rey"}); -// data.push_back({"system of", "a down"}); double totalInsertionTime = 0.0; for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { diff --git a/DataStructures/DiskHashTable/MMDiskHashTable.h b/DataStructures/DiskHashTable/MMDiskHashTable.h index dd4cf281bb6cda496053b1b17cb816724bfc1805..dc16044492bde4e92a832d6366f478250a7feb2c 100644 --- a/DataStructures/DiskHashTable/MMDiskHashTable.h +++ b/DataStructures/DiskHashTable/MMDiskHashTable.h @@ -115,6 +115,7 @@ public: for(size_t i = 0; i < 10; i++) { map[i] = sizeString[i]; } + return false; } /** diff --git a/DataStructures/HashTable/HashTable.h b/DataStructures/HashTable/HashTable.h index 4c1b04bb807ae4f67c2eec940b6a832629771cde..adfdab5c84e6df6344eebf7670e0ae2e146211a2 100644 --- a/DataStructures/HashTable/HashTable.h +++ b/DataStructures/HashTable/HashTable.h @@ -123,8 +123,6 @@ public: HashTable ( const HashTable & copy ) { this->numElements = copy.numElements; - this->numCollisions = copy.numCollisions; - this->maxCollisions = copy.maxCollisions; this->numBuckets = copy.numBuckets; this->sizeVector = copy.sizeVector; array.resize ( sizeVector ); diff --git a/DataStructures/HashTable/HashTableTests.cpp b/DataStructures/HashTable/HashTableTests.cpp index 08d921bfceb9ceb2bb86baa264753b896288ac21..8b77cd8805aae35d707d907d0e82c1b6bb142c8a 100644 --- a/DataStructures/HashTable/HashTableTests.cpp +++ b/DataStructures/HashTable/HashTableTests.cpp @@ -3,11 +3,25 @@ // #include <iostream> +#include <cassert> +#include <string> #include "HashTable.h" using namespace std; -int main() { +void testInsert(); +void testRehash(); +void testCollisions(); + +int main( ) + { + cout << "Begin testHashTable:" << endl << endl; + + testInsert(); + testRehash(); + testCollisions(); + + //Nicks tests HashTable<string, string> ht; ht["hello"] = "goodbye"; @@ -15,6 +29,107 @@ int main() { ht["i want you to feel"] = "surreal"; assert(ht["i want you to feel"] == "surreal"); + cout << endl << "ALL TESTS PASSED :)" << endl; + } + +void testInsert( ) + { + cout << "Testing testInsert..." << endl; + HashTable < string, int > lib; + lib["Four"] = 4; + lib[ "Hundred" ] = 100; + lib[ "Twenty" ] = 20; + + assert( lib[ "Four" ] == 4); + assert( lib[ "Hundred"] == 100 ); + assert( lib[ "twenty" ] != 20 ); + assert( lib[ "Twenty" ] == 20 ); + + HashTable < string, string> string_string; + string_string["donald trump"] = "lol"; + assert( string_string.size( ) == 1 ); + string_string["kendrick"] = "lamar"; + assert( string_string.size( ) == 2 ); + string_string["iphone"] = "apple"; + assert( string_string.size( ) == 3 ); + string_string["iphone1"] = "apple"; + assert( string_string.size( ) == 4 ); + string_string["iphone2"] = "apple"; + assert( string_string.size( ) == 5 ); + string_string["iphone3"] = "apple"; + assert( string_string.size( ) == 6 ); + string_string["iphone4"] = "apple"; + assert( string_string.size( ) == 7 ); + string_string["iphone5"] = "apple"; + assert( string_string.size( ) == 8 ); + string_string["iphone6"] = "apple"; + assert( string_string.size( ) == 9 ); + string_string["iphone7"] = "apple"; + assert( string_string.size( ) == 10 ); + string_string[ "YUGE" ]; + assert( string_string["YUGE"].empty( ) ); + assert( string_string["kendrick"] == "lamar" ); + cout << "testInsert Passed!" << endl; + } + +void testRehash() + { + cout << "Testing testRehash..." << endl; + HashTable < string, int > lib; + + //Pushback enough items so it hits 50% load factor, and must rehash + lib[ "one" ] = 1; + lib[ "two" ] = 2; + lib[ "three" ] = 3; + lib[ "four" ] = 4; + lib[ "five" ] = 5; + lib[ "six" ] = 6; + lib[ "seven" ] = 7; + lib[ "eight" ] = 8; + lib[ "nine" ] = 9; + lib[ "ten" ] = 10; + + assert( lib.capacity( ) == 32); + assert( lib.size( ) == 10); + + //Ensure that it rehashes to correct value + assert( lib[ "one" ] == 1 ); + assert( lib[ "two" ] == 2 ); + assert( lib[ "three" ] == 3 ); + assert( lib[ "four" ] == 4 ); + assert( lib[ "five" ] == 5 ); + assert( lib[ "six" ] == 6 ); + assert( lib[ "seven" ] == 7 ); + assert( lib[ "eight" ] == 8 ); + assert( lib[ "nine" ] == 9 ); + assert( lib[ "ten" ] == 10 ); + lib[ "eleven" ] = 11; + assert( lib[ "eleven" ] == 11 ); + assert( lib.capacity( ) == 32 ); + assert( lib.size( ) == 11 ); + cout << "testRehash Passed!" << endl; + } + +void testCollisions( ) + { + cout << "Testing testCollisions..." << endl; + HashTable < string, int > lib; + lib["Four"] = 4; + lib[ "Hundred" ] = 100; + lib[ "Twenty" ] = 20; + lib[ "one" ] = 1; + lib[ "two" ] = 2; + lib[ "three" ] = 3; + lib[ "four" ] = 4; + lib[ "five" ] = 5; + lib[ "six" ] = 6; + lib[ "seven" ] = 7; + lib[ "eight" ] = 8; + lib[ "nine" ] = 9; + lib[ "ten" ] = 10; + //assert(lib.getNumCollisions() == 5); + //assert(lib.getMaxCollisions() == 2); + //assert(lib.getNumBuckets() == 8); - return 0; -} \ No newline at end of file + cout << "testCollisions Passed!" << endl; + } diff --git a/DataStructures/Vector/VectorTests.cpp b/DataStructures/Vector/VectorTests.cpp index 5f2e732dbabe5c21ddf8851c2ac891ee7d3979cb..0489245ae8e6cdd5b30d1ae9dcd71937e362eba0 100644 --- a/DataStructures/Vector/VectorTests.cpp +++ b/DataStructures/Vector/VectorTests.cpp @@ -4,16 +4,30 @@ #include <iostream> #include <cassert> +#include <string> #include "Vector.h" -#include <vector> -using namespace std; +void testPushBack( ); +void testResize( ); +void testReserve( ); +void testIterator( ); +void testStruct( ); +void testPointers( ); -// TOOO: COMPREHENSIVE TESTS -int main() { +int main( ) + { + std::cout << "Beginning Array tests... \n"; + testPushBack( ); + testReserve( ); + testResize( ); + testIterator( ); + testStruct( ); + testPointers( ); /*Fails this test*/ + + //Nicks testing below Vector<int> test1; test1.reserve(20); - vector<int> test62; + Vector<int> test62; test62.reserve(20); test1[51] = 52; test62[51] = 352; @@ -23,6 +37,143 @@ int main() { test1[0] = 150; assert(test1[0] == 150); assert(test1.size() == 1); - cout << "ALL VECTOR TESTS PASS :)" << endl; -} + + std::cout << "Array tests successful! :)"; + } + +void testPushBack( ) + { + std::cout << "Testing pushback ... \n"; + Vector< int > Arr; + assert( Arr.size( ) == 0); + Arr.push_back(1); + assert( Arr[ 0 ] == 1); + assert( Arr.size( ) == 1); + Arr.push_back(4); + assert( Arr[ 1 ] == 4); + assert( Arr.size( ) == 2); + + Arr.push_back(3); + assert( Arr[ 2 ] == 3); + assert( Arr.size( ) == 3); + + Arr.push_back(2); + assert( Arr[ 0 ] == 1); + assert( Arr[ 1 ] == 4); + assert( Arr[ 2 ] == 3); + assert( Arr[ 3 ] == 2); + assert( Arr.size( ) == 4); + + Vector< std::string > StringArr; + StringArr.push_back("one"); + assert( StringArr[ 0 ] == "one"); + assert( StringArr.size( ) == 1); + + StringArr.push_back("four"); + assert( StringArr[ 1 ] == "four"); + assert( StringArr.size( ) == 2); + + StringArr.push_back("three"); + assert( StringArr[ 2 ] == "three"); + assert( StringArr.size( ) == 3); + + StringArr.push_back("two"); + assert( StringArr[ 0 ] == "one"); + assert( StringArr[ 1 ] == "four"); + assert( StringArr[ 2 ] == "three"); + assert( StringArr[ 3 ] == "two"); + assert( StringArr.size( ) == 4); + } + +void testReserve( ) + { + std::cout << "Testing Reserve ... \n"; + Vector< int > testArr; + testArr.reserve( 5 ); + assert( testArr.size( ) == 0 ); + testArr.push_back( 7 ); + assert( testArr[ 0 ] == 7); + assert( testArr.size( ) == 1); + testArr.reserve( 6 ); + assert( testArr[ 0 ] == 7); + //TODO add more shit to this, probabaly with size_of( ) + } + +void testResize( ) + { + std::cout << "Testing Resize ... \n"; + Vector< int > testArr; + testArr.resize( 5 ); + assert( testArr.size( ) == 5 ); + assert( testArr[ 3 ] == 0 ); + testArr[ 3 ] = 42; + assert( testArr[ 3 ] == 42 ); + testArr.resize( 10 ); + assert( testArr[ 3 ] == 42); + testArr.push_back( 3 ); + assert( testArr[ 10 ] == 3); + } + +void testIterator( ) + { + std::cout<<"Testing Iterator...\n"; + + Vector< int > Arr; + Arr.push_back( 5 ); + + Vector< int >::Iterator iter = Arr.begin( ); + assert( *iter == 5 ); + Arr.push_back( 6 ); + ++iter; + assert( *iter == 6 ); + --iter; + assert( *iter == 5); + assert( iter != Arr.end( ) ); + ++++iter; + assert( iter == Arr.end( ) ); + } + +void testStruct( ) + { + std::cout << "Testing Struct...\n"; + struct Node + { + Node *next; + int val; + }; + Node example; + example.val = 7; + Vector< Node > container; + container.push_back(example); + assert(container[ 0 ].val == 7); + Node example2; + example2.val = 99; + container.resize( 5 ); + container[ 3 ] = example2; + assert( container[ 3 ].val == 99); + assert( container[ 0 ].val == 7); + assert( container[ 2 ].val == 0); + struct WordCount + { + std::string word; + int freq; + }; + + Vector< WordCount > history; + + } + +void testPointers( ) + { + std::cout << "Testing Pointers...\n"; + struct Node { + Node *next; + int val; + }; + Vector< Node* > ptrArr; + ptrArr.resize( 20 ); + ptrArr[ 9 ] = new Node( ); + ptrArr[ 9 ]->val = 9; + assert( ptrArr[ 9 ]->val == 9); + } diff --git a/ISRAnd-tests b/ISRAnd-tests index 2ee14d218d302687a79f88970eaee2dbd7b2d705..e73b68b74e1bb0323aed5e74f647dc1e03270e6c 100755 Binary files a/ISRAnd-tests and b/ISRAnd-tests differ diff --git a/ISRMultiWordORTest b/ISRMultiWordORTest new file mode 100755 index 0000000000000000000000000000000000000000..098733a2b9be4aa9b941249dd625551aeb3a6469 Binary files /dev/null and b/ISRMultiWordORTest differ diff --git a/ISROR-tests b/ISROR-tests index 1e1d2db6dce52b5624ac63836a68bf90eddfe924..bfa9a2e1f8a1be47367cc5a47322d18b37b7c146 100755 Binary files a/ISROR-tests and b/ISROR-tests differ diff --git a/ISRWord-Multi-Test b/ISRWord-Multi-Test index 00f28962a84882e0a1a3eb88e427b1a9cc5691ae..c29f3a5e6f67d2f14331a627c85cec1458c58279 100755 Binary files a/ISRWord-Multi-Test and b/ISRWord-Multi-Test differ diff --git a/ISRWord-tests b/ISRWord-tests deleted file mode 100755 index 20d015635eb96a3cc920b8a82826bd61231babd9..0000000000000000000000000000000000000000 Binary files a/ISRWord-tests and /dev/null differ diff --git a/constraintSolver/ISR.h b/constraintSolver/ISR.h index 98bcf1f4276f3b2e99aff210d1db3b0df7f91fd3..c7961e5383f8dc9905fd0bc8e229cab6512140db 100644 --- a/constraintSolver/ISR.h +++ b/constraintSolver/ISR.h @@ -29,15 +29,18 @@ class ISR public: // Returns - virtual Location First ( ) = 0; //Returns next post of a word given current location - virtual Location Next ( ) = 0; + //virtual Location Next ( ) = 0; //Calls seek onto one past the current end doc location //Return first instance of word at new document - virtual Location NextDocument ( ) = 0; + void NextDocument ( ) { + + currentLocation = Seek( GetEndDocument()->getCurrentDoc().docEndPosition + 1); + + } //Returns first instance of word after target location virtual Location Seek ( Location target ) = 0; @@ -47,10 +50,13 @@ public: virtual ISREndDoc * GetEndDocument ( ) = 0; + Location GetCurrentLocation ( ) + { + return currentLocation; + } - - Location currentLocation; + Location currentLocation = 0; ISREndDoc *DocumentEnd = new ISREndDoc ( ); diff --git a/constraintSolver/ISRAnd.cpp b/constraintSolver/ISRAnd.cpp index d5df6e357189dae5a2a884e37ccf983717944156..a5eb351cc692360ec3df24dfc8f9903093065b1e 100644 --- a/constraintSolver/ISRAnd.cpp +++ b/constraintSolver/ISRAnd.cpp @@ -27,41 +27,11 @@ ISRAnd::ISRAnd( vector < ISR * > InputTerms ) : Terms( InputTerms ) return; } -Location ISRAnd::First() - { - //Fixme? - /* - Location first = MAX_Location; - - for(auto isr : Terms) - { - Location temp = isr->First(); - if (temp < first) - { - first = temp; - } - } - - //fixme should this return the nearest location of one subterm or the nearest location all the terms match? - currentLocation = Seek(first); - return currentLocation; - */ - } -Location ISRAnd::Next() - { - return Seek( nearestStartLocation ); - } -Location ISRAnd::NextDocument() - { - currentLocation = Seek( GetEndDocument( )->getCurrentDoc( ).docEndPosition + 1 ); - return currentLocation; - } - Location ISRAnd::Seek( Location target ) { @@ -145,11 +115,5 @@ ISREndDoc *ISRAnd::GetEndDocument() return furthestTerm->DocumentEnd; } -Location ISRAnd::GetCurrentLocation() - { - //What does currentLocation hold? When is it updated? - //return DocumentEnd->Seek(currentLocation); - return currentLocation; - } diff --git a/constraintSolver/ISRAnd.h b/constraintSolver/ISRAnd.h index 5ee7169f8e3b0625275476e9d76e8f65210ca8dc..964dd7015d3731163ba87599eb75a896d0313b87 100644 --- a/constraintSolver/ISRAnd.h +++ b/constraintSolver/ISRAnd.h @@ -10,7 +10,7 @@ using namespace std; //Find occurrences of all child ISRs within a single document -class ISRAnd : ISR +class ISRAnd : public ISR { public: vector<ISR*>Terms; @@ -18,12 +18,8 @@ public: ISRAnd ( vector<ISR * > InputTerms ); - Location First ( ) override; - Location Next ( ) override; - Location NextDocument ( ) override; Location Seek ( Location target ) override; ISREndDoc * GetEndDocument ( ) override; - Location GetCurrentLocation(); diff --git a/constraintSolver/ISROr.cpp b/constraintSolver/ISROr.cpp index d3282a1727df2f678268f217f5dfa7bd74ee41c3..983f94617d41d2babd90fc4fcce6c4c16173df59 100644 --- a/constraintSolver/ISROr.cpp +++ b/constraintSolver/ISROr.cpp @@ -3,113 +3,78 @@ // #include "ISROr.h" +#include <cassert> - -ISROr::ISROr ( vector<ISR * > InputTerms ) : Terms( InputTerms ) +ISROr::ISROr( vector < ISR * > InputTerms ) : Terms( InputTerms ) { - for (auto currentTerm : InputTerms) - { - currentTerm->First(); - Location currentLocation = currentTerm->currentLocation; - if (currentLocation < nearestStartLocation) { - nearestTerm = currentTerm; - nearestStartLocation = currentLocation; + assert( InputTerms.size( ) > 1 ); - } - if (currentLocation > nearestEndLocation) { - nearestEndLocation = currentLocation; - } - ++NumberOfTerms; - currentTerm++; + Location first = MAX_Location; + for ( auto isr : Terms ) + { + Location temp = isr->currentLocation; + if ( temp < first ) + { + first = temp; + } } - } -Location ISROr::GetStartLocation ( ) - { - return nearestStartLocation; + //fixme should this return the nearest location of one subterm or the nearest location all the terms match? + currentLocation = Seek( first ); + return; } -Location ISROr::GetCurrentLocation(){ - return nearestStartLocation; - } -Location ISROr::GetEndLocation ( ) - { - return nearestEndLocation; - } -Location ISROr::First() - { - //Fixme - Location x; - return x; - } -/* -Returns the location of the next document that is a match -*/ -Location ISROr::Next ( ) +Location ISROr::Seek( Location target ) { - Location nearestEnd = this->nearestTerm->GetEndDocument( ); - for(auto Term : Terms) - { - Location newSeekLocation = Term->Seek( nearestEnd + 1 ); - if ( newSeekLocation < nearestStartLocation ) - { - nearestStartLocation = newSeekLocation; - nearestTerm = Term; - } - } - return this->nearestTerm->currentLocation; + //Todo + // 1. Seek all the ISRs to the first occurrence beginning at + // the target location. + // 2. Move the document end ISR to just past the furthest + // word, then calculate the document begin location. + // 3. Seek all the other terms to past the document begin. + // 4. If any term is past the document end, return to + // step 2. + // 5. If any ISR reaches the end, there is no match. - } - -Location ISROr::NextDocument() - { - //Fixme - Location x; - return x; - } + Location nearest = MAX_Location ; -Location ISROr::Seek ( Location target ) - { + //find nearest & furthest ISR + for ( auto isr : Terms ) + { + Location temp = isr->Seek( target ); + if ( temp < nearest ) + { + nearest = temp; + nearestTerm = isr; + } - // Seek all the ISRs to the first occurrence beginning at// the target location. Return null if there is no match. - // The document is the document containing the nearest term. - //seek past target locations, - //seek all terms in or past starting location, take the ones that nears - //the document that the nearest term is in is the document ur in - //updates private members + } + return nearest; - return 1; - } -Location ISROr::GetEndDocument() - { - //Fixme - Location x; - return x; } -/* -ISR *ISROr::GetCurrentEndDoc ( ) +ISREndDoc *ISROr::GetEndDocument() { + //What does currentLocation hold? When is it updated? + return nearestTerm->DocumentEnd; + } - return this->nearestTerm->GetDocumentISR( ); - } -*/ diff --git a/constraintSolver/ISROr.h b/constraintSolver/ISROr.h index 657c3fee37363b7665b2f43101688b26227bd38c..9a1fcc547e852ae2ee424c4c6055ebbd85b9eb95 100644 --- a/constraintSolver/ISROr.h +++ b/constraintSolver/ISROr.h @@ -6,33 +6,26 @@ #include "ISR.h" #include <vector> -// Find occurrences of any child ISR. + using namespace std; +//Find occurrences of all child ISRs within a single document + class ISROr : public ISR { public: - vector<ISR*>Terms; unsigned NumberOfTerms; ISROr ( vector<ISR * > InputTerms ); - Location First ( ) override; - Location Next ( ) override; - Location NextDocument ( ) override; Location Seek ( Location target ) override; - Location GetEndDocument ( ) override; + ISREndDoc * GetEndDocument ( ) override; - Location GetCurrentLocation(); - - Location GetStartLocation ( ); - Location GetEndLocation ( ); private: ISR *nearestTerm; - // nearStartLocation and nearestEndLocation are// the start and end of the nearestTerm. - Location nearestStartLocation, nearestEndLocation; }; + diff --git a/constraintSolver/ISRWord.cpp b/constraintSolver/ISRWord.cpp index 0d303e4e8233b9b0ccd9434892fc65d6ddb4af2f..a5547c392c4823d42ca3efdd1e12c8664ee1a2a5 100644 --- a/constraintSolver/ISRWord.cpp +++ b/constraintSolver/ISRWord.cpp @@ -11,7 +11,7 @@ size_t FileSize(int f) { ISRWord::ISRWord ( string word ) { term = word; - + frequency = 0; getChunks( ); if(listOfChunks.size( ) == 0) { @@ -180,6 +180,7 @@ Location ISRWord::Seek( Location target ) { if(target <= currentLocation) return currentLocation; + if(!wordSeekLookupTable.empty()) { auto best = wordSeekLookupTable.front(); for(auto entry : wordSeekLookupTable) { @@ -198,7 +199,6 @@ Location ISRWord::Seek( Location target ) { } } else { while(Next() <= target) { - } if( currentLocation == MAX_Location) return MAX_Location; @@ -209,18 +209,8 @@ Location ISRWord::Seek( Location target ) { } -Location ISRWord::NextDocument() - { - //FixMe - //seek the isr to the first location after the doc end - currentLocation = Seek( DocumentEnd->getCurrentDoc().docEndPosition + 1); - //update the doc end to the next doc end after the new seek position - return DocumentEnd->getCurrentDoc().docEndPosition; - - } - ISREndDoc * ISRWord::GetEndDocument() { //Fixme diff --git a/constraintSolver/ISRWord.h b/constraintSolver/ISRWord.h index d12e2dada797844e65d5ce985754ec12c96b595b..056a8472c7e9cfa34c6f2379515ea7a7b0d71192 100644 --- a/constraintSolver/ISRWord.h +++ b/constraintSolver/ISRWord.h @@ -26,9 +26,8 @@ class ISRWord : public ISR ISRWord ( string word ); - Location First ( ) override; - Location Next ( ) override; - Location NextDocument ( ) override; + Location First ( ) ; + Location Next ( ) ; Location Seek ( Location target ) override; ISREndDoc * GetEndDocument ( ) override; diff --git a/constraintSolver/tests/ISRAndTests.cpp b/constraintSolver/tests/ISRAndTests.cpp index 04d7a1990f05244cdf8dd18b5ac8d0bb420f3961..126a50b1aa6bf1097926679f7f6ff057b0f5e74e 100644 --- a/constraintSolver/tests/ISRAndTests.cpp +++ b/constraintSolver/tests/ISRAndTests.cpp @@ -23,11 +23,6 @@ int main ( ) ISRWord *q = new ISRWord( query.c_str() ); input.push_back(q); } - //ISRWord *q2 = new ISRWord("time"); - //ISRWord *q3 = new ISRWord("time"); - //ISRWord *q4 = new ISRWord("time"); - //ISRWord *q5 = new ISRWord("time"); - //ISRWord *q6 = new ISRWord("time"); diff --git a/constraintSolver/tests/ISRMultiWordORTest.cpp b/constraintSolver/tests/ISRMultiWordORTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9dc5876d5e1b1927d203f603b7f0303e800842be --- /dev/null +++ b/constraintSolver/tests/ISRMultiWordORTest.cpp @@ -0,0 +1,107 @@ +// +// Created by Jake Close on 4/6/18. +// + +// +// Created by nick on 3/16/18. +// + +#include <iostream> +#include <set> +#include "../../indexer/DocumentEnding.h" +#include "../ISRWord.h" +#include <iterator> + +#include <vector> +#include "../ISREndDoc.h" + + +using namespace std; + +int main ( ) + { + vector<ISRWord> queries; + vector< vector< size_t > > locations; + + /* + static const char TITLE = '#'; + static const char ANCHOR = '@'; + static const char URL = '$'; + static const char BODY = '%'; + static const char HOST = '='; + */ + + ISRWord q1 = ISRWord("chicken"); + ISRWord q2 = ISRWord("waffle"); + + + queries.push_back(q2); + queries.push_back(q1); + + + + + + vector<vector<string>> urls; + vector<string> out1; + vector<string> out2; + urls.push_back(out1 ); + urls.push_back(out2 ); + + set<string> or_set; + + vector<size_t> loc1; + vector<size_t> loc2; + locations.push_back(loc1); + locations.push_back(loc2); + + + vector<double> times; + for(int i = 0; i <queries.size() ; i++ ) + { + clock_t start = clock(); + while(queries[i].getCurrentLocation() != MAX_Location) { + auto url = queries[i].DocumentEnd->getCurrentDoc().url; + urls[i].push_back( url ); + queries[i].NextDocument(); + + } + clock_t end = clock(); + double time = (end - start) / (double) CLOCKS_PER_SEC; + times.push_back(time); + + } + + + + + + int i = 0; + for(auto output : urls) + { + cout << queries[i].term << endl; + cout << "Time to complete query: " << times[i] << endl; + for(auto urrl : output) { + cout << urrl << endl; + } + i++; + } + + + cout << "Printing Set Intersection " << endl; + vector<string> v1 = urls[0]; + vector<string> v2 = urls[1]; + + std::vector<string> v_intersection; + + std::set_union(v1.begin(), v1.end(), + v2.begin(), v2.end(), + std::inserter(v_intersection, v_intersection.begin())); + for(auto url : v_intersection) + std::cout << url << endl; + + cout << "Number of results" << v_intersection.size( ) << endl; + + + return 0; + } \ No newline at end of file diff --git a/constraintSolver/tests/ISROrTests.cpp b/constraintSolver/tests/ISROrTests.cpp index 3f982cb10d120ae34b2372232797e8439e5349db..e08f5c7d5719ceab8f82bf7776e642bd53019d5b 100644 --- a/constraintSolver/tests/ISROrTests.cpp +++ b/constraintSolver/tests/ISROrTests.cpp @@ -13,33 +13,57 @@ using namespace std; int main ( ) { - char* query; - ISRWord *q1 = new ISRWord("iphone"); - ISRWord *q2 = new ISRWord("apple"); + string query; vector< ISR* > input; - input.push_back(q1); - input.push_back(q2); + query = "!"; + while(cin >> query) + { + if(query == "-q") + break; + ISRWord *q = new ISRWord( query.c_str() ); + input.push_back(q); + } + + + + ISROr *queryOr = new ISROr(input); - ISREndDoc endDocs; - vector<size_t> locations; - vector<DocumentEnding> docEnds; set<string> urls; + clock_t start = clock(); + while(queryOr->GetCurrentLocation() != MAX_Location) { - locations.push_back(queryOr->Next()); - } - while(endDocs.next().url != "aaa") - { - for(auto locs : locations) - { - if(locs < endDocs.getCurrentDoc().docEndPosition && - locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) { - urls.insert(endDocs.getCurrentDoc().url); - } - } + + auto url = queryOr->GetEndDocument()->getCurrentDoc().url; + if(url == "file59tweet151421970293063681") + cout << "PREBUG"; + cout << url << endl; + urls.insert(url); + queryOr->NextDocument(); } + + clock_t end = clock(); + + for(auto urrl : urls) { cout << urrl << endl; } + + cout << "Time to complete query: " << (end - start) / (double) CLOCKS_PER_SEC << endl; + cout << "Number of results: " << urls.size(); + + /* + * + * moment and life + file3tweet151407709667856384 + file5tweet151408405939093504 + file9tweet151409353818255361 + * + * + * token and life + * file48tweet151419172700684288 + */ + + return 0; } \ No newline at end of file diff --git a/constraintSolver/tests/ISRWordMultiTest.cpp b/constraintSolver/tests/ISRWordMultiTest.cpp index f6315a7a5d8d551f325639ce934808e547c14d00..4e94590b33e0201175171dfd73c3a0c9b784b6e1 100644 --- a/constraintSolver/tests/ISRWordMultiTest.cpp +++ b/constraintSolver/tests/ISRWordMultiTest.cpp @@ -27,8 +27,8 @@ int main ( ) static const char HOST = '='; */ - ISRWord q1 = ISRWord("#trump"); - ISRWord q2 = ISRWord("%trump"); + ISRWord q1 = ISRWord("moment"); + ISRWord q2 = ISRWord("life"); queries.push_back(q2); @@ -36,7 +36,6 @@ int main ( ) - vector<DocumentEnding> docEnds; vector<vector<string>> urls; @@ -97,6 +96,7 @@ int main ( ) for(auto url : v_intersection) std::cout << url << endl; + cout << "Number of results" << v_intersection.size( ) << endl; return 0; } \ No newline at end of file diff --git a/constraintSolver/tests/ISRWordTests.cpp b/constraintSolver/tests/ISRWordTests.cpp index 3990f658f7a418fd59c6a2b4d36277791ca99a1c..15e5ed6f2af8c3e50eab729a0f828158f3c9d315 100644 --- a/constraintSolver/tests/ISRWordTests.cpp +++ b/constraintSolver/tests/ISRWordTests.cpp @@ -6,6 +6,7 @@ #include <set> #include "../../indexer/DocumentEnding.h" #include "../ISRWord.h" +#include "../../util/Stemmer.h" #include "../ISREndDoc.h" @@ -22,13 +23,17 @@ int main ( ) { decorators.push_back("@"); decorators.push_back("$"); + Stemmer stem; + /* for(auto dec : decorators) { - ISRWord queryWord( dec + "trump"); + ISRWord queryWord( dec + stem.execute("world") ) ; queries.push_back(queryWord); } - +*/ + ISRWord queryWord( "world" ) ; + queries.push_back(queryWord); vector<size_t> locations; set<string> urls; @@ -40,7 +45,7 @@ int main ( ) { auto url = query.DocumentEnd->getCurrentDoc().url; urls.insert( url ); query.NextDocument(); - cout << url << endl; + } } diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 1be9273d1878e93b02987834165f28dea8795a85..17d4e37d4410fd774f5f673f3c44a3063671b8f8 100755 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -41,7 +41,7 @@ void Indexer::run() } chunkDictionary[word.first].docFrequency++; indexedCount += word.second.size( ); - currentBlockNumberWords += word.second.size( ); + currentBlockNumberWords += word.second.size( ); for ( auto location : word.second ) { diff --git a/query/Ranker/Ranker.cpp b/query/Ranker/Ranker.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a636c05e554b36bff2b89e3668705b7409cac94e --- /dev/null +++ b/query/Ranker/Ranker.cpp @@ -0,0 +1,46 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + +#include "Ranker.h" +#include "Site.h" +#include "../../constraintSolver/ISRWord.h" +#include "../../constraintSolver/ISREndDoc.h" +#include <vector> +#include <queue> +#include <string> +#include <set> + +/*** + * TODO: Finalize how we want this to work + * Initializes the ranker with the Word ISRs from the query, will most likely scale to add more input, + * could possibly pull the information, depends on how much control we want + */ +void Ranker::init ( vector<ISRWord> query ) + { + inputQuery = query; + } + +/*** + * This will perform the "sorting" and return a list of ranked URLS + * @return + */ +vector< string > Ranker::rank ( ) + { + ISREndDoc endDocs; + vector<size_t> locations; + vector<DocumentEnding> docEnds; + set<string> urls; + + + for( auto queryWord = inputQuery.begin(); queryWord < inputQuery.end(); ++queryWord) + { + while ( queryWord->getCurrentLocation ( ) != MAX_Location ) + { + auto url = queryWord->DocumentEnd->getCurrentDoc ( ).url; + urls.insert ( url ); + queryWord->NextDocument ( ); + + } + } + } \ No newline at end of file diff --git a/query/Ranker/Ranker.h b/query/Ranker/Ranker.h new file mode 100644 index 0000000000000000000000000000000000000000..c1bd00523f6fe09b79e713834fa9f46113de8507 --- /dev/null +++ b/query/Ranker/Ranker.h @@ -0,0 +1,53 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + + +#ifndef EECS398_SEARCH_RANKER_H +#define EECS398_SEARCH_RANKER_H + + +#include <queue> +#include "../../constraintSolver/ISRWord.h" +#include "Site.h" + +/*** + * Custom Comparator for the priority queue that keeps the websites in their correct order. + */ +class Comp + { +public: + bool operator()(Site L, Site R) + { + return L.getScore() > R.getScore(); + } + }; + +class Ranker + { +public: + + Ranker(){}; + + Ranker( vector< ISRWord > query ){ + init( query ); + }; + vector< string > rank(); +private: + + void init( vector< ISRWord> query ); + + //Queue to continuously sort the sites + priority_queue< Site, vector< Site> , Comp > WebsiteQueue; + vector< ISRWord > inputQuery; + + //TODO: Not sure if we will need these + vector< string > urls; + vector< Site > Websites; + + + }; + + +#endif //EECS398_SEARCH_RANKER_H + diff --git a/query/Ranker/Scorer.cpp b/query/Ranker/Scorer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a9879d6ac3cc9934333e509b297d9313684ffc5 --- /dev/null +++ b/query/Ranker/Scorer.cpp @@ -0,0 +1,27 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + +#include "Scorer.h" +#include "Site.h" +#include <vector> + +/*** + * Calculate the score for some site, Normalize the score to 1.0 + * @return + */ +double Scorer::getScore ( Site website) + { + double score = 0.0; + int numberOfFunctions = 1; + + //Repeat for each function + score += Simple( website )*SIMPLE_WEIGHT; + + return score / (double)numberOfFunctions; + } + +double Scorer::Simple( Site ) + { + return 42.0; + } \ No newline at end of file diff --git a/query/Ranker/Scorer.h b/query/Ranker/Scorer.h new file mode 100644 index 0000000000000000000000000000000000000000..5cd655c65411d4228b9b1acc0497589eed089882 --- /dev/null +++ b/query/Ranker/Scorer.h @@ -0,0 +1,25 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + +#ifndef EECS398_SEARCH_SCORER_H +#define EECS398_SEARCH_SCORER_H + +#include "Site.h" + +class Scorer + { +public: + Scorer() + { + } + + double getScore( Site ); + +private: + + //Simple example heuristic + double Simple( Site ); + const double SIMPLE_WEIGHT = 1.0; + }; +#endif //EECS398_SEARCH_SCORER_H diff --git a/query/Ranker/Site.cpp b/query/Ranker/Site.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ca01dee5e15c8c6037c7d9fdf2846d2e14a7b452 --- /dev/null +++ b/query/Ranker/Site.cpp @@ -0,0 +1,25 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + +#include "Site.h" +#include "Scorer.h" +#include <string> +#include <vector> + +/*** + * Returns the score of some site. Either grabs the value if its been scored previously, or generates a new score using + * a scorer object. + * @return + */ +double Site::getScore ( ) + { + if( this->hasBeenScored ) + { + return this->score; + } + + hasBeenScored = true; + Scorer rank = Scorer(); + return rank.getScore( *this ); + } \ No newline at end of file diff --git a/query/Ranker/Site.h b/query/Ranker/Site.h new file mode 100644 index 0000000000000000000000000000000000000000..95b57dce9ead070566fdebe30688ba452738d4bb --- /dev/null +++ b/query/Ranker/Site.h @@ -0,0 +1,20 @@ +// +// Created by Zane Dunnings on 4/2/18. +// +/*** + * Class to represent each website in the ranking engine + */ +#ifndef EECS398_SEARCH_SITE_H +#define EECS398_SEARCH_SITE_H + +#include <string> +class Site + { +public: + + double getScore(); + std::string url; + double score; + bool hasBeenScored; + }; +#endif //EECS398_SEARCH_SITE_H diff --git a/query/Ranker/tests/RankerTest.cpp b/query/Ranker/tests/RankerTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..29588f9e982767937aa61d9f53217a0e7d4626f7 --- /dev/null +++ b/query/Ranker/tests/RankerTest.cpp @@ -0,0 +1,38 @@ +// +// Created by Zane Dunnings on 4/2/18. +// + +#include "../Ranker.h" +#include <iostream> +#include <set> + + +void testSimple(); + +int main() + { + cout << "------Starting Ranker Test------" << endl; + testSimple (); + cout << "------Passed All Ranker Tests---" << endl; + } + +void testSimple() + { + //Initialize Ran + ISRWord queryWord("%trump"); + ISREndDoc endDocs; + vector<size_t> locations; + vector<DocumentEnding> docEnds; + set<string> urls; + + while( queryWord.getCurrentLocation() != MAX_Location ) + { + auto url = queryWord.DocumentEnd->getCurrentDoc().url; + urls.insert( url ); + queryWord.NextDocument(); + + } + + } + + diff --git a/query/queryLanguage/QueryParser.cpp b/query/queryLanguage/QueryParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e42ee1c30e9886cd78d6cd052e2bedd03eda93d8 --- /dev/null +++ b/query/queryLanguage/QueryParser.cpp @@ -0,0 +1,426 @@ +// +// Created by Zane Dunnings on 3/16/18. +// + +//Outline of query language from Prof. Nicole Hamilton, University of Michigan 03/15/2018 +// 72 lines +#include "QueryParser.h" +#include<unordered_set> +#include "../../util/stringProcessing.h" +#include<iostream> + +/*** + * Returns a token of the next word in the query, past the given index + * @param index + * @return + */ +Token QueryParser::FindNextToken( int &index ){ + //TODO remove this when you add new ISR + unordered_set<char> stopChars; + stopChars.insert(' '); + + int size = 1; + int start = index; + + + while(start + size < query.size()) + { + if ( query[ start + size ] == '"' ) + { + ++size; + while( query[start + size ]!= '"' && (start + size < query.size()) ) + { + ++size; + } + if(start + size < query.size()) + ++size; + index = start + size; + string text = query.substr ( start, size ); + if( MatchOR ( text ) ) + return Token( "-OR-" ); + return Token( text ); + } + else if ( stopChars.count( query[ start + size ] ) > 0) + { + index = start + size; + string text = query.substr ( start, size ); + cout << "horse" << text; + + return Token( text ); + } + else + { + ++size; + } + } + index = start + size; + string text = query.substr ( start, size ); + cout << "horsey: " << text; + + return Token( text ); + } + +/*** Builds QueryTree from input query + * + * @param input + */ +void QueryParser::parse( string input ) + { + query = input; + Token current; + int location = 0; + while( location < input.size( ) ) + { + //TODO needs to be BF Traversal + current = FindNextToken( location ); + Tuple * next = new Tuple( current ); + queryTree->Next.push_back( next ); + + } + } + +/*** + * destructor for the Query Parser + */ +QueryParser::~QueryParser ( ) + { + delete_children ( queryTree ); + delete queryTree; + } + +/*** + * Traverses down the tree and deletes all of the nodes in the tree + * @param node + */ +void QueryParser::delete_children( Tuple* node ) + { + for( int i = 0; i < node->Next.size( ); ++i ) + { + delete_children( node->Next[ i ] ); + delete node->Next[ i ]; + } + } + +/*** + * Prints the compiled Query for testing + */ +void QueryParser::printCompiledQuery() + { + cout << "Query Tree: \n"; + deque<Tuple *> queue; + deque<int> levelQueue; + queue.push_back( queryTree ); + levelQueue.push_back( 0 ); + traverse( queue, levelQueue ); + } + + +void QueryParser::traverse(deque< Tuple*> queue, deque< int> levels) + { + int deepest = 0; + while(!queue.empty()) + { + Tuple *current = queue.front ( ); + queue.pop_front ( ); + int currLevel = levels.front(); + levels.pop_front (); + for ( int i = 0; i < current->Next.size ( ); ++i ) + { + queue.push_back( current->Next[ i ] ); + levels.push_back( currLevel + 1); + } + cout << " | "; + if( currLevel > deepest) + { + deepest = currLevel; + cout << "\n[ "<<deepest<<" ] "; + } + + cout << " " << current->object.text << " "; + } + } + +/*** + * Returns whether or not the input string is a conditional OR type + * @param input + * @return + */ +bool QueryParser::MatchOR( string input ) + { + unordered_set<string> ORMatch; + ORMatch.insert("OR"); + ORMatch.insert("|"); + ORMatch.insert("||"); + ORMatch.insert("or"); + + if( ORMatch.count( input ) > 0 ) + { + return true; + } + return false; + } + +/*** + * Returns whether or not the input string is a conditional OR type + * @param input + * @return + */ +bool QueryParser::MatchAND( string input ) + { + unordered_set<string> ORMatch; + ORMatch.insert("AND"); + ORMatch.insert("&"); + ORMatch.insert("&&"); +3ww ORMatch.insert("and"); + + if( 3ORMatch.count( input ) > 0 ) + { + return true; + } + return false; + } + +/*** + * Highest level query parsing, splits the input string on OR, then builds tree subtrees without + * @param input + */ +Tuple* QueryParser::Constraint( string input ) + { + vector<Tuple * > constraintList; + Tuple *t = new Tuple(); + constraintList = breakOnOR( input ); + + + if( constraintList.size( ) > 1 ) + t->Type = OrTupleType; + else + t->Type = AndTupleType; + t->Next = constraintList; + + //Iterate through the subcontraints and if there are ORs, then run this again, else split on and for each + for (int i = 0; i < constraintList.size( ); ++i ) + { + string word =constraintList[ i ]->object.text; + //If the subtype needs an or, then build a new or tuple + if(isOrType(word)) + { + Tuple* toBeKilled = constraintList[ i ]; + constraintList[ i ] = Constraint ( word ); + constraintList[ i ]->Type = OrTupleType; + delete toBeKilled; + toBeKilled = nullptr; + } + else if(isAndType(word)) + { + Tuple* toBeKilled = constraintList[ i ]; + constraintList[ i ] = Constraint ( word ); + constraintList[ i ]->Type = AndTupleType; + delete toBeKilled; + toBeKilled = nullptr; + } + } + + + + } + + +/*** + * Breaks input string on ORs, returns a list of tuples of those strings + * E.G. hello | (bye OR goodbye) hola -> [ 'hello', '(bye OR goodbye) hola' ] + * @param input + * @return + */ +vector<Tuple * > QueryParser::breakOnOR( string input ) + { + int depth = 0; + + //TODO: use these to cover different types of nested brackets with a couple queues + unordered_set<char> openBracket; + openBracket.insert('('); + openBracket.insert('{'); + openBracket.insert('['); + + unordered_set<char> closedBracket; + closedBracket.insert(')'); + closedBracket.insert('}'); + closedBracket.insert(']'); + vector<string> query = splitStr (input, ' ', 0); + + vector<Tuple *> constraintList; + int start = 0; + for( int i = 0; i < query.size( ); ++i ) + { + //TODO: remove the parenths matching, just return the biggest string possible + if( query[ i ] == "(") + { + ++depth; +// ++depth; +// ++i; +// start = i; +// while(depth != 0 && ( i < query.size()) ) +// { +// if (query[ i ] == "(") +// ++depth; +// else if (query[ i ] == ")") +// --depth; +// if (depth == 0) +// { +// --i; +// break; +// } +// else +// ++i; +// } +// if (i == query.size()) +// i = query.size() - 1; +// string text = ""; +// for ( int j = start; j < i; ++ j) +// { +// text+= query[ j ]; +// } +// Tuple * t = new Tuple( text ); +// constraintList.push_back( t ); +// t->Type = AndTupleType; + } + else if( query[ i ] == ")") + { + --depth; + } + else if( MatchOR( query[ i ]) && depth == 0 ) + { + string text = query[ 0 ]; + for ( int j = start; j < i; ++ j) + { + text+= query[ j ]; + } + Tuple * subConstraint = new Tuple( text ); + constraintList.push_back( subConstraint ); + start = i + 1; + } + else if( i == query.size( ) - 1 ) + { + string text; + for ( int j = start; j < i; ++ j) + { + text+= query[ j ]; + } + Tuple * subConstraint = new Tuple( text ); + constraintList.push_back( subConstraint ); + } + } + return constraintList; + } + +Tuple * baseConstraint( string input ) + { +// while( t = simpleConstraint ( input )) + return nullptr; + } + +/*** + * Returns if a string has an OR at its highest level + */ +bool QueryParser::isOrType( string input ) + { + vector<string> query = splitStr (input, ' ', 0); + int depth = 0; + for( auto word = query.begin(); word != query.end(); ++word ) + { + if(depth == 0 && MatchOR(*word)) + { + return true; + } + if(*word == "(") + { + ++depth; + } + else if(*word == ")") + { + --depth; + } + } + return false; + } + +/*** + * Returns if a string has an OR at its highest level + */ +bool QueryParser::isAndType( string input ) + { + vector<string> query = splitStr (input, ' ', 0); + int depth = 0; + for( auto word = query.begin(); word != query.end(); ++word ) + { + if(depth == 0 && MatchAND(*word)) + { + return true; + } + if(*word == "(") + { + ++depth; + } + else if(*word == ")") + { + --depth; + } + + + + } + return false; + } + +vector<Tuple * > QueryParser::breakOnAND( string input ) + { + int depth = 0; + + //TODO: use these to cover different types of nested brackets with a couple queues + unordered_set<char> openBracket; + openBracket.insert('('); + openBracket.insert('{'); + openBracket.insert('['); + + unordered_set<char> closedBracket; + closedBracket.insert(')'); + closedBracket.insert('}'); + closedBracket.insert(']'); + vector<string> query = splitStr (input, ' ', 0); + + vector<Tuple *> constraintList; + int start = 0; + for( int i = 0; i < query.size( ); ++i ) + { + //TODO: remove the parenths matching, just return the biggest string possible + if( query[ i ] == "(") + { + ++depth; + } + else if( query[ i ] == ")") + { + --depth; + } + else if( MatchAND( query[ i ]) && depth == 0 ) + { + string text = query[ 0 ]; + for ( int j = start; j < i; ++ j) + { + text+= query[ j ]; + } + Tuple * subConstraint = new Tuple( text ); + constraintList.push_back( subConstraint ); + start = i + 1; + } + else if( i == query.size( ) - 1 ) + { + string text; + for ( int j = start; j < i; ++ j) + { + text+= query[ j ]; + } + Tuple * subConstraint = new Tuple( text ); + constraintList.push_back( subConstraint ); + } + } + return constraintList; + } \ No newline at end of file diff --git a/query/queryLanguage/QueryParser.h b/query/queryLanguage/QueryParser.h new file mode 100644 index 0000000000000000000000000000000000000000..53735b9bafe8df115e1f3943da9e0b97b7749050 --- /dev/null +++ b/query/queryLanguage/QueryParser.h @@ -0,0 +1,72 @@ +// +// Created by Zane Dunnings on 3/16/18. +// + +#ifndef EECS398_SEARCH_QUERYPARSER_H +#define EECS398_SEARCH_QUERYPARSER_H + +#include "../../util/DataStructureLib/tuple.cpp" +#include<deque> +// Outline of query language from Prof. Nicole Hamilton, University of Michigan 03/15/2018 +// 41 lines + + +// <Constraint> ::= <BaseConstraint> +// { <OrOp> <BaseConstraint> } +// +// +// <OrOp> ::= 'OR' | '|' | '||' +// +// <BaseConstraint> ::= <SimpleConstaint> +// { [ <AndOp> ] <SimpleConstraint> } +// +// <AndOp> ::= 'AND' | '&' | '&&' +// +// <SimpleConstraint> ::= <Phrase> | <NestedConstraint> | +// <UnaryOp> <SimpleConstraint> | +// <SearchWord> +// +// <UnaryOp> ::= '+' | '-' | 'NOT' +// +// <Phrase> ::= '"' { <SearchWord> } '"' +// +// <NestedConstraint> ::= '(' <Constraint> ')' + + +class QueryParser + { + +public: + QueryParser( ) + :queryTree( nullptr ), query( "" ){ + queryTree = new Tuple( TupleType::AndTupleType ); + } + //QueryParser( string query ); + + void parse( string input ); + + Token FindNextToken( int &index ); + Tuple * Constraint( string input ); + vector<Tuple * > breakOnOR( string input ); + vector<Tuple * > breakOnAND( string input ); + + void printCompiledQuery( ); + + ~QueryParser ( ); + + + Tuple* queryTree; + string query; +private: + void traverse(deque< Tuple*> queue, deque< int> levels); + void delete_children( Tuple* node ); + bool MatchOR( string input ); + bool MatchAND( string input ); + bool isAndType( string input ); + bool isOrType( string input ); + + + + }; + +#endif //EECS398_SEARCH_QUERYPARSER_H diff --git a/query/queryLanguage/tests/testQueryParser.cpp b/query/queryLanguage/tests/testQueryParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5132c78407240cc61e8e3fabcae7df6875a6a298 --- /dev/null +++ b/query/queryLanguage/tests/testQueryParser.cpp @@ -0,0 +1,21 @@ +// +// Created by Zane Dunnings on 3/17/18. +// +//TODO Remove STL +#include "../QueryParser.h" +#include<iostream> +#include <fstream> + +int main() + { + string query = "apollo moon OR landing"; + QueryParser parser; + parser.parse( query ); + parser.printCompiledQuery(); + + string query1 = " \"apollo moon\" landing"; + QueryParser parser1; + parser1.parse( query1 ); + parser1.printCompiledQuery (); + + } \ No newline at end of file