Commit 3cc58b8e authored by jsclose's avatar jsclose

removed the on-disk doc map lookup stuff, and created an isolated crawler test

parent 8b26f280
......@@ -46,6 +46,27 @@ add_executable(crawler-parser-indexer-test
util/stringProcessing.cpp
indexer/Indexer.cpp)
add_executable(isolated-integration
crawler/tests/crawlerTest.cpp
shared/ProducerConsumerQueue.h
shared/ThreadClass.h
shared/url.h
crawler/crawler.cpp
crawler/Readers/StreamReader.h
crawler/Readers/HttpReader.cpp
crawler/Readers/HttpsReader.cpp
crawler/Readers/LocalReader.cpp
crawler/spider.cpp
util/util.cpp
shared/Document.cpp
parser/Parser.cpp
util/Stemmer.cpp
util/Tokenizer.cpp
util/stringProcessing.cpp
indexer/Indexer.cpp)
add_executable(StringProcessingTest
util/stringProcessing.cpp
util/Stemmer.cpp
......@@ -92,6 +113,8 @@ find_package(OpenSSL REQUIRED)
target_link_libraries(ParserTest OpenSSL::SSL)
target_link_libraries(isolated-integration OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-indexer-test OpenSSL::SSL pthread)
......
......@@ -4,18 +4,32 @@
#include "crawler.h"
void Crawler::SpawnSpiders ( size_t num_spiders, unordered_map< string, int > *docMapLookup,
unordered_map< size_t, int > *duplicateUrlMap )
/*
 * @param num_spiders  number of spiders to create
 *
 * Creates num_spiders spiders and starts a new thread for each one.
 */
void Crawler::SpawnSpiders ( size_t num_spiders,
unordered_map< size_t, int > *duplicateUrlMap
)
{
for ( size_t i = 0; i < num_spiders; i++ )
{
Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap, this->IndexerQueue );
Spider *temp = new Spider( this->mode, this->urlFrontier, duplicateUrlMap, this->IndexerQueue );
temp->StartThread( );
this->spiders.push_back( temp );
}
}
/*
 * Waits for all of the spider threads to finish running.
 */
void Crawler::WaitOnAllSpiders ( )
{
cout << "Waiting for spiders to finish...\n";
......
......@@ -17,13 +17,16 @@ class Crawler
{
public:
Crawler ( string mode_in, ProducerConsumerQueue< ParsedUrl > *url_q_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
: IndexerQueue( doc_index_queue_in ), mode( mode_in ), urlFrontier( url_q_in )
Crawler ( string mode_in,
ProducerConsumerQueue< ParsedUrl > *url_q_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
: IndexerQueue( doc_index_queue_in ),
mode( mode_in ),
urlFrontier( url_q_in )
{ };
//Spawns a number of worker spiders
void SpawnSpiders ( size_t num_spiders, unordered_map< string, int > *docMapLookup,
void SpawnSpiders ( size_t num_spiders,
unordered_map< size_t, int > *duplicateUrlMap );
//Creates a housekeeping thread
......
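Taken together, the constructor and SpawnSpiders changes mean callers never build or pass a doc-map lookup any more. Below is a minimal sketch of the new call pattern; it mirrors the isolated crawler test added later in this commit, and the "web" mode string, single spider, and repo-root include paths are assumptions borrowed from that test rather than requirements of the API.

#include <string>
#include <unordered_map>
#include <vector>
#include "shared/ProducerConsumerQueue.h"
#include "shared/url.h"
#include "crawler/crawler.h"
#include "indexer/Indexer.h"

using namespace std;
using DocIndex = const unordered_map< string, vector< unsigned long > >;

int main ( )
    {
    // Only the duplicate-URL map survives; the on-disk doc map lookup is gone.
    unordered_map< size_t, int > *duplicateUrlMap = new unordered_map< size_t, int >( );
    ProducerConsumerQueue< ParsedUrl > *urlFrontier = new ProducerConsumerQueue< ParsedUrl >( );
    ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );

    Indexer indexer( IndexerQueue );
    indexer.StartThread( );

    Crawler crawler( "web", urlFrontier, IndexerQueue );
    crawler.SpawnSpiders( 1, duplicateUrlMap );   // no docMapLookup argument any more
    crawler.WaitOnAllSpiders( );
    indexer.WaitForFinish( );

    delete urlFrontier;
    return 0;
    }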
......@@ -79,7 +79,7 @@ void Spider::run ( )
std::cout << "Spider is crawling" << endl;
int cond = 0;
while ( cond < 25 )
while ( cond < 1 )
{
ParsedUrl currentUrl = getUrl( );
size_t docID = hash( currentUrl.CompleteUrl );
......@@ -122,8 +122,11 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
* Takes in a parsed url, creates a document object, and writes information about the document to disk.
* Returns the beginning position of the document on disk and stores it in the in-memory lookup hash table.
*/
bool Spider::writeDocToDisk ( ParsedUrl url )
{
/*
Document d( url );
int resultPosition = d.WriteToDocMap( );
if ( resultPosition == -1 )
......@@ -136,7 +139,7 @@ bool Spider::writeDocToDisk ( ParsedUrl url )
{
std::cout << it->first << " => " << it->second << '\n';
}
*/
return true;
}
......
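With the on-disk doc map gone, the spider's only duplicate detection is the in-memory duplicateUrlMap keyed by the size_t hash of the complete URL (the docID computed above). The code that actually consults the map falls outside this hunk, so the helper below is a hypothetical sketch of that check; the name markUrlIfNew and the use of std::hash stand in for whatever spider.cpp really does.

#include <functional>
#include <string>
#include <unordered_map>

// Hypothetical helper: records a URL's hash the first time it is seen and
// reports whether it was new. Not the committed spider.cpp code.
static bool markUrlIfNew ( const std::string &completeUrl,
                           std::unordered_map< size_t, int > *duplicateUrlMap )
    {
    size_t docID = std::hash< std::string >{ }( completeUrl );
    if ( duplicateUrlMap->count( docID ) )
        return false;                      // duplicate: already crawled or queued
    ( *duplicateUrlMap )[ docID ] = 1;     // first sighting: remember and crawl it
    return true;
    }

Keying on a size_t hash keeps the map to one entry per seen URL and avoids the per-URL disk lookups the old doc map implied.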
......@@ -24,13 +24,11 @@ public:
Spider ( string mode_in,
ProducerConsumerQueue< ParsedUrl > *url_q_in,
unordered_map< string, int > *doc_map_lookup_in,
unordered_map< size_t, int > *duplicate_url_map_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in
)
: mode( mode_in ),
urlFrontier( url_q_in ),
docMapLookup( doc_map_lookup_in ),
parser( url_q_in ),
duplicateUrlMap( duplicate_url_map_in ),
IndexerQueue( doc_index_queue_in )
......@@ -60,7 +58,6 @@ private:
ProducerConsumerQueue< DocIndex * > *IndexerQueue;
unordered_map< size_t, int > *duplicateUrlMap;
string mode;
unordered_map< string, int > *docMapLookup;
Parser parser;
};
\ No newline at end of file
//
// Created by Jake Close on 3/21/18.
//
#include "../crawler.h"
#include "../spider.h"
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <queue>
#include "../../crawler/crawler.h"
#include <openssl/ssl.h>
#include <string>
#include <unordered_map>
#include "../../util/util.h"
#include <getopt.h>
#include "../../indexer/Indexer.h"
using DocIndex = const unordered_map< string, vector< unsigned long > >;
using namespace std;
int main ( int argc, char *argv[] )
{
int numberOfSpiders = 1;
unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );
ProducerConsumerQueue < ParsedUrl > *urlFrontier = new ProducerConsumerQueue < ParsedUrl >( );
ProducerConsumerQueue < DocIndex * > *IndexerQueue = new ProducerConsumerQueue < DocIndex * >( );
Indexer indexer( IndexerQueue );
indexer.StartThread( );
string mode = "web";
Crawler crawler( mode, urlFrontier, IndexerQueue );
crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap );
crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( );
auto f = urlFrontier->Pop( );
int x = 0;
delete urlFrontier;
return 0;
}
\ No newline at end of file
File added
......@@ -132,7 +132,6 @@ int main ( int argc, char *argv[] )
ParsedUrl url = ParsedUrl( testFile );
urlFrontier->Push( url );
}
unordered_map< string, int > *docMapLookUp = new unordered_map< string, int >( );
Indexer indexer( IndexerQueue );
......@@ -140,7 +139,7 @@ int main ( int argc, char *argv[] )
Crawler crawler( mode, urlFrontier, IndexerQueue );
crawler.SpawnSpiders( numberOfSpiders, docMapLookUp, duplicateUrlMap );
crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap );
crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( );
......
https://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:cars
https://wikipedia.org/wiki/71st_British_Academy_Film_Awards
http://www.bbc.com/
https://www.eecs.umich.edu/
......