diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c0f522fae5c966668a50c56ee824104a6f9eb93..5dcc7210cb225b59ca195fe34c54ed3a43670595 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,16 +3,23 @@ project(eecs398_search) set(CMAKE_CXX_STANDARD 11) -add_executable(eecs398_search +add_executable(search main.cpp shared/ProducerConsumerQueue.cpp shared/ProducerConsumerQueue.h - shared/ProducerConsumerQueue_test.cpp parser/Parser.h - parser/Parser.h + parser/Parser.h util/Tokenizer.h util/stringProcessing.h util/Stemmer.h) + + +add_executable(crawl main.cpp shared/ProducerConsumerQueue.h shared/ThreadClass.h shared/url.h crawler/crawler.cpp crawler/SocketReader.cpp crawler/StreamReader.h crawler/spider.cpp util/util.h crawler/LocalReader.h crawler/StreamReader.h parser/Parser.h shared/Document.cpp parser/Parser.cpp) + + +add_executable(test1 main.cpp shared/ProducerConsumerQueue.h + shared/ThreadClass.h crawler/crawler.cpp crawler/spider.cpp shared/url.h crawler/StreamReader.h util/util.cpp crawler/SocketReader.cpp crawler/SocketReader.h crawler/LocalReader.h ) + add_executable(StringProcessingTest util/stringProcessing.h util/Stemmer.h @@ -32,3 +39,10 @@ add_executable(ParserEndToEndTest shared/url.h parser/tests/parserTest.cpp) + + +find_package(OpenSSL REQUIRED) + +target_link_libraries(crawl OpenSSL::SSL) + +target_link_libraries(test1 OpenSSL::SSL) \ No newline at end of file diff --git a/ParserEndToEndTest b/ParserEndToEndTest new file mode 100755 index 0000000000000000000000000000000000000000..14ae4eaf767c4f83f33fc6ef6ca5de9218f6345b Binary files /dev/null and b/ParserEndToEndTest differ diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 018f12143eb670f7e032d26866e0767640b4872d..a19d25728d4b45a01df5b7a28dfd6302a247f575 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -6,18 +6,18 @@ #include "spider.h" +#include "../parser/Parser.h" #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <unistd.h> -#include "../util/util.h" #include "LocalReader.h" #include "SocketReader.h" #include "../shared/Document.h" - +#include "../util/util.h" size_t Spider::hash(const char * s){ { // http://www.cse.yorku.ca/~oz/hash.html @@ -62,7 +62,11 @@ void Spider::FuncToRun() string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + to_string(docID)+ ".txt"; int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk); - //parser.parse(reader); + + /* + Document document ( currentUrl, reader->buffer ); + auto dictionary = parser.execute ( &document ); + */ cond = true; } else @@ -87,7 +91,10 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec */ - +/* + * Takes in a parsed url, creates a document object, writes information about the document to disk + * returns the begining position of the document on disk, stores that into the in memory lookup hash table +*/ bool Spider::writeDocToDisk(ParsedUrl url) { Document d(url); @@ -103,6 +110,12 @@ bool Spider::writeDocToDisk(ParsedUrl url) return true; } +/* + * + * Takes a parsed url, checks if its in the local in memory hash table of documents return false + * If url was crawled but past a certain point, reindexs or does not exist , indexes the doc + * and returns true + */ bool Spider::shouldURLbeCrawled( ParsedUrl url ) diff --git a/crawler/spider.h b/crawler/spider.h index a5d2a501e0f062ef4b565b6522f309107b2189ec..e1a99a2abd9d463516a7bc930e3d75ae1b6b4559 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -9,6 +9,8 @@ #include<iostream> #include <unordered_map> #include "StreamReader.h" +#include "../parser/Parser.h" +#include "../util/util.h" using namespace std; @@ -20,8 +22,9 @@ public: Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in, unordered_map < string, int > *doc_map_lookup_in ) - : mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ) - { }; + : mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( Parser( url_q_in)) + { + }; //Takes a url off of the url frontier @@ -45,6 +48,7 @@ public: private: int locationOnDisk; + Parser parser; ProducerConsumerQueue < string > *urlFrontier; string mode; unordered_map < string, int > *docMapLookup; diff --git a/makefile b/makefile index d639ab851862209c711be2d31cf4d16a367985c4..363ce8c2c5cc0ad19b036816ccb8e05b89135ecd 100644 --- a/makefile +++ b/makefile @@ -56,17 +56,6 @@ CMAKE_BINARY_DIR = /Users/jakeclose/Desktop/398/project/eecs398-search #============================================================================= # Targets provided globally by CMake. -# Special rule for the target rebuild_cache -rebuild_cache: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." - /Applications/CLion.app/Contents/bin/cmake/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) -.PHONY : rebuild_cache - -# Special rule for the target rebuild_cache -rebuild_cache/fast: rebuild_cache - -.PHONY : rebuild_cache/fast - # Special rule for the target edit_cache edit_cache: @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." @@ -78,6 +67,17 @@ edit_cache/fast: edit_cache .PHONY : edit_cache/fast +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /Applications/CLion.app/Contents/bin/cmake/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + # The main all target all: cmake_check_build_system $(CMAKE_COMMAND) -E cmake_progress_start /Users/jakeclose/Desktop/398/project/eecs398-search/CMakeFiles /Users/jakeclose/Desktop/398/project/eecs398-search/CMakeFiles/progress.marks @@ -111,30 +111,95 @@ depend: .PHONY : depend #============================================================================= -# Target rules for targets named url_test +# Target rules for targets named ParserEndToEndTest + +# Build rule for target. +ParserEndToEndTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 ParserEndToEndTest +.PHONY : ParserEndToEndTest + +# fast build rule for target. +ParserEndToEndTest/fast: + $(MAKE) -f CMakeFiles/ParserEndToEndTest.dir/build.make CMakeFiles/ParserEndToEndTest.dir/build +.PHONY : ParserEndToEndTest/fast + +#============================================================================= +# Target rules for targets named StemmerTest # Build rule for target. -url_test: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 url_test -.PHONY : url_test +StemmerTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 StemmerTest +.PHONY : StemmerTest # fast build rule for target. -url_test/fast: - $(MAKE) -f CMakeFiles/url_test.dir/build.make CMakeFiles/url_test.dir/build -.PHONY : url_test/fast +StemmerTest/fast: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/build +.PHONY : StemmerTest/fast #============================================================================= -# Target rules for targets named main +# Target rules for targets named StringProcessingTest # Build rule for target. -main: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 main -.PHONY : main +StringProcessingTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 StringProcessingTest +.PHONY : StringProcessingTest # fast build rule for target. -main/fast: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/build -.PHONY : main/fast +StringProcessingTest/fast: + $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/build +.PHONY : StringProcessingTest/fast + +#============================================================================= +# Target rules for targets named TokenizerTest + +# Build rule for target. +TokenizerTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 TokenizerTest +.PHONY : TokenizerTest + +# fast build rule for target. +TokenizerTest/fast: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/build +.PHONY : TokenizerTest/fast + +#============================================================================= +# Target rules for targets named test1 + +# Build rule for target. +test1: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 test1 +.PHONY : test1 + +# fast build rule for target. +test1/fast: + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/build +.PHONY : test1/fast + +#============================================================================= +# Target rules for targets named crawl + +# Build rule for target. +crawl: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 crawl +.PHONY : crawl + +# fast build rule for target. +crawl/fast: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/build +.PHONY : crawl/fast + +#============================================================================= +# Target rules for targets named search + +# Build rule for target. +search: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 search +.PHONY : search + +# fast build rule for target. +search/fast: + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/build +.PHONY : search/fast crawler/SocketReader.o: crawler/SocketReader.cpp.o @@ -142,7 +207,8 @@ crawler/SocketReader.o: crawler/SocketReader.cpp.o # target to build an object file crawler/SocketReader.cpp.o: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/SocketReader.cpp.o + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/SocketReader.cpp.o + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/SocketReader.cpp.o .PHONY : crawler/SocketReader.cpp.o crawler/SocketReader.i: crawler/SocketReader.cpp.i @@ -151,7 +217,8 @@ crawler/SocketReader.i: crawler/SocketReader.cpp.i # target to preprocess a source file crawler/SocketReader.cpp.i: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/SocketReader.cpp.i + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/SocketReader.cpp.i + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/SocketReader.cpp.i .PHONY : crawler/SocketReader.cpp.i crawler/SocketReader.s: crawler/SocketReader.cpp.s @@ -160,7 +227,8 @@ crawler/SocketReader.s: crawler/SocketReader.cpp.s # target to generate assembly for a file crawler/SocketReader.cpp.s: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/SocketReader.cpp.s + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/SocketReader.cpp.s + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/SocketReader.cpp.s .PHONY : crawler/SocketReader.cpp.s crawler/crawler.o: crawler/crawler.cpp.o @@ -169,7 +237,8 @@ crawler/crawler.o: crawler/crawler.cpp.o # target to build an object file crawler/crawler.cpp.o: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/crawler.cpp.o + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/crawler.cpp.o + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/crawler.cpp.o .PHONY : crawler/crawler.cpp.o crawler/crawler.i: crawler/crawler.cpp.i @@ -178,7 +247,8 @@ crawler/crawler.i: crawler/crawler.cpp.i # target to preprocess a source file crawler/crawler.cpp.i: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/crawler.cpp.i + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/crawler.cpp.i + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/crawler.cpp.i .PHONY : crawler/crawler.cpp.i crawler/crawler.s: crawler/crawler.cpp.s @@ -187,7 +257,8 @@ crawler/crawler.s: crawler/crawler.cpp.s # target to generate assembly for a file crawler/crawler.cpp.s: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/crawler.cpp.s + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/crawler.cpp.s + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/crawler.cpp.s .PHONY : crawler/crawler.cpp.s crawler/spider.o: crawler/spider.cpp.o @@ -196,7 +267,8 @@ crawler/spider.o: crawler/spider.cpp.o # target to build an object file crawler/spider.cpp.o: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/spider.cpp.o + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/spider.cpp.o + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/spider.cpp.o .PHONY : crawler/spider.cpp.o crawler/spider.i: crawler/spider.cpp.i @@ -205,7 +277,8 @@ crawler/spider.i: crawler/spider.cpp.i # target to preprocess a source file crawler/spider.cpp.i: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/spider.cpp.i + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/spider.cpp.i + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/spider.cpp.i .PHONY : crawler/spider.cpp.i crawler/spider.s: crawler/spider.cpp.s @@ -214,7 +287,8 @@ crawler/spider.s: crawler/spider.cpp.s # target to generate assembly for a file crawler/spider.cpp.s: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/crawler/spider.cpp.s + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/crawler/spider.cpp.s + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/crawler/spider.cpp.s .PHONY : crawler/spider.cpp.s main.o: main.cpp.o @@ -223,7 +297,9 @@ main.o: main.cpp.o # target to build an object file main.cpp.o: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/main.cpp.o + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/main.cpp.o + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/main.cpp.o + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/main.cpp.o .PHONY : main.cpp.o main.i: main.cpp.i @@ -232,7 +308,9 @@ main.i: main.cpp.i # target to preprocess a source file main.cpp.i: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/main.cpp.i + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/main.cpp.i + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/main.cpp.i + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/main.cpp.i .PHONY : main.cpp.i main.s: main.cpp.s @@ -241,35 +319,199 @@ main.s: main.cpp.s # target to generate assembly for a file main.cpp.s: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/main.cpp.s + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/main.cpp.s + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/main.cpp.s + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/main.cpp.s .PHONY : main.cpp.s -shared/urlTest.o: shared/urlTest.cpp.o +parser/Parser.o: parser/Parser.cpp.o + +.PHONY : parser/Parser.o + +# target to build an object file +parser/Parser.cpp.o: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/parser/Parser.cpp.o +.PHONY : parser/Parser.cpp.o + +parser/Parser.i: parser/Parser.cpp.i + +.PHONY : parser/Parser.i + +# target to preprocess a source file +parser/Parser.cpp.i: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/parser/Parser.cpp.i +.PHONY : parser/Parser.cpp.i + +parser/Parser.s: parser/Parser.cpp.s + +.PHONY : parser/Parser.s + +# target to generate assembly for a file +parser/Parser.cpp.s: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/parser/Parser.cpp.s +.PHONY : parser/Parser.cpp.s + +parser/tests/parserTest.o: parser/tests/parserTest.cpp.o + +.PHONY : parser/tests/parserTest.o + +# target to build an object file +parser/tests/parserTest.cpp.o: + $(MAKE) -f CMakeFiles/ParserEndToEndTest.dir/build.make CMakeFiles/ParserEndToEndTest.dir/parser/tests/parserTest.cpp.o +.PHONY : parser/tests/parserTest.cpp.o + +parser/tests/parserTest.i: parser/tests/parserTest.cpp.i + +.PHONY : parser/tests/parserTest.i + +# target to preprocess a source file +parser/tests/parserTest.cpp.i: + $(MAKE) -f CMakeFiles/ParserEndToEndTest.dir/build.make CMakeFiles/ParserEndToEndTest.dir/parser/tests/parserTest.cpp.i +.PHONY : parser/tests/parserTest.cpp.i + +parser/tests/parserTest.s: parser/tests/parserTest.cpp.s + +.PHONY : parser/tests/parserTest.s + +# target to generate assembly for a file +parser/tests/parserTest.cpp.s: + $(MAKE) -f CMakeFiles/ParserEndToEndTest.dir/build.make CMakeFiles/ParserEndToEndTest.dir/parser/tests/parserTest.cpp.s +.PHONY : parser/tests/parserTest.cpp.s + +shared/Document.o: shared/Document.cpp.o + +.PHONY : shared/Document.o + +# target to build an object file +shared/Document.cpp.o: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/shared/Document.cpp.o +.PHONY : shared/Document.cpp.o + +shared/Document.i: shared/Document.cpp.i + +.PHONY : shared/Document.i + +# target to preprocess a source file +shared/Document.cpp.i: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/shared/Document.cpp.i +.PHONY : shared/Document.cpp.i + +shared/Document.s: shared/Document.cpp.s + +.PHONY : shared/Document.s + +# target to generate assembly for a file +shared/Document.cpp.s: + $(MAKE) -f CMakeFiles/crawl.dir/build.make CMakeFiles/crawl.dir/shared/Document.cpp.s +.PHONY : shared/Document.cpp.s + +shared/ProducerConsumerQueue.o: shared/ProducerConsumerQueue.cpp.o -.PHONY : shared/urlTest.o +.PHONY : shared/ProducerConsumerQueue.o # target to build an object file -shared/urlTest.cpp.o: - $(MAKE) -f CMakeFiles/url_test.dir/build.make CMakeFiles/url_test.dir/shared/urlTest.cpp.o -.PHONY : shared/urlTest.cpp.o +shared/ProducerConsumerQueue.cpp.o: + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/shared/ProducerConsumerQueue.cpp.o +.PHONY : shared/ProducerConsumerQueue.cpp.o -shared/urlTest.i: shared/urlTest.cpp.i +shared/ProducerConsumerQueue.i: shared/ProducerConsumerQueue.cpp.i -.PHONY : shared/urlTest.i +.PHONY : shared/ProducerConsumerQueue.i # target to preprocess a source file -shared/urlTest.cpp.i: - $(MAKE) -f CMakeFiles/url_test.dir/build.make CMakeFiles/url_test.dir/shared/urlTest.cpp.i -.PHONY : shared/urlTest.cpp.i +shared/ProducerConsumerQueue.cpp.i: + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/shared/ProducerConsumerQueue.cpp.i +.PHONY : shared/ProducerConsumerQueue.cpp.i -shared/urlTest.s: shared/urlTest.cpp.s +shared/ProducerConsumerQueue.s: shared/ProducerConsumerQueue.cpp.s -.PHONY : shared/urlTest.s +.PHONY : shared/ProducerConsumerQueue.s # target to generate assembly for a file -shared/urlTest.cpp.s: - $(MAKE) -f CMakeFiles/url_test.dir/build.make CMakeFiles/url_test.dir/shared/urlTest.cpp.s -.PHONY : shared/urlTest.cpp.s +shared/ProducerConsumerQueue.cpp.s: + $(MAKE) -f CMakeFiles/search.dir/build.make CMakeFiles/search.dir/shared/ProducerConsumerQueue.cpp.s +.PHONY : shared/ProducerConsumerQueue.cpp.s + +util/tests/stemmerTest.o: util/tests/stemmerTest.cpp.o + +.PHONY : util/tests/stemmerTest.o + +# target to build an object file +util/tests/stemmerTest.cpp.o: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/tests/stemmerTest.cpp.o +.PHONY : util/tests/stemmerTest.cpp.o + +util/tests/stemmerTest.i: util/tests/stemmerTest.cpp.i + +.PHONY : util/tests/stemmerTest.i + +# target to preprocess a source file +util/tests/stemmerTest.cpp.i: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/tests/stemmerTest.cpp.i +.PHONY : util/tests/stemmerTest.cpp.i + +util/tests/stemmerTest.s: util/tests/stemmerTest.cpp.s + +.PHONY : util/tests/stemmerTest.s + +# target to generate assembly for a file +util/tests/stemmerTest.cpp.s: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/tests/stemmerTest.cpp.s +.PHONY : util/tests/stemmerTest.cpp.s + +util/tests/stringProcessingTest.o: util/tests/stringProcessingTest.cpp.o + +.PHONY : util/tests/stringProcessingTest.o + +# target to build an object file +util/tests/stringProcessingTest.cpp.o: + $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/tests/stringProcessingTest.cpp.o +.PHONY : util/tests/stringProcessingTest.cpp.o + +util/tests/stringProcessingTest.i: util/tests/stringProcessingTest.cpp.i + +.PHONY : util/tests/stringProcessingTest.i + +# target to preprocess a source file +util/tests/stringProcessingTest.cpp.i: + $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/tests/stringProcessingTest.cpp.i +.PHONY : util/tests/stringProcessingTest.cpp.i + +util/tests/stringProcessingTest.s: util/tests/stringProcessingTest.cpp.s + +.PHONY : util/tests/stringProcessingTest.s + +# target to generate assembly for a file +util/tests/stringProcessingTest.cpp.s: + $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/tests/stringProcessingTest.cpp.s +.PHONY : util/tests/stringProcessingTest.cpp.s + +util/tests/tokenizerTest.o: util/tests/tokenizerTest.cpp.o + +.PHONY : util/tests/tokenizerTest.o + +# target to build an object file +util/tests/tokenizerTest.cpp.o: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/tests/tokenizerTest.cpp.o +.PHONY : util/tests/tokenizerTest.cpp.o + +util/tests/tokenizerTest.i: util/tests/tokenizerTest.cpp.i + +.PHONY : util/tests/tokenizerTest.i + +# target to preprocess a source file +util/tests/tokenizerTest.cpp.i: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/tests/tokenizerTest.cpp.i +.PHONY : util/tests/tokenizerTest.cpp.i + +util/tests/tokenizerTest.s: util/tests/tokenizerTest.cpp.s + +.PHONY : util/tests/tokenizerTest.s + +# target to generate assembly for a file +util/tests/tokenizerTest.cpp.s: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/tests/tokenizerTest.cpp.s +.PHONY : util/tests/tokenizerTest.cpp.s util/util.o: util/util.cpp.o @@ -277,7 +519,7 @@ util/util.o: util/util.cpp.o # target to build an object file util/util.cpp.o: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/util/util.cpp.o + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/util/util.cpp.o .PHONY : util/util.cpp.o util/util.i: util/util.cpp.i @@ -286,7 +528,7 @@ util/util.i: util/util.cpp.i # target to preprocess a source file util/util.cpp.i: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/util/util.cpp.i + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/util/util.cpp.i .PHONY : util/util.cpp.i util/util.s: util/util.cpp.s @@ -295,7 +537,7 @@ util/util.s: util/util.cpp.s # target to generate assembly for a file util/util.cpp.s: - $(MAKE) -f CMakeFiles/main.dir/build.make CMakeFiles/main.dir/util/util.cpp.s + $(MAKE) -f CMakeFiles/test1.dir/build.make CMakeFiles/test1.dir/util/util.cpp.s .PHONY : util/util.cpp.s # Help Target @@ -304,10 +546,15 @@ help: @echo "... all (the default if no target is provided)" @echo "... clean" @echo "... depend" - @echo "... rebuild_cache" @echo "... edit_cache" - @echo "... url_test" - @echo "... main" + @echo "... ParserEndToEndTest" + @echo "... rebuild_cache" + @echo "... StemmerTest" + @echo "... StringProcessingTest" + @echo "... TokenizerTest" + @echo "... test1" + @echo "... crawl" + @echo "... search" @echo "... crawler/SocketReader.o" @echo "... crawler/SocketReader.i" @echo "... crawler/SocketReader.s" @@ -320,9 +567,27 @@ help: @echo "... main.o" @echo "... main.i" @echo "... main.s" - @echo "... shared/urlTest.o" - @echo "... shared/urlTest.i" - @echo "... shared/urlTest.s" + @echo "... parser/Parser.o" + @echo "... parser/Parser.i" + @echo "... parser/Parser.s" + @echo "... parser/tests/parserTest.o" + @echo "... parser/tests/parserTest.i" + @echo "... parser/tests/parserTest.s" + @echo "... shared/Document.o" + @echo "... shared/Document.i" + @echo "... shared/Document.s" + @echo "... shared/ProducerConsumerQueue.o" + @echo "... shared/ProducerConsumerQueue.i" + @echo "... shared/ProducerConsumerQueue.s" + @echo "... util/tests/stemmerTest.o" + @echo "... util/tests/stemmerTest.i" + @echo "... util/tests/stemmerTest.s" + @echo "... util/tests/stringProcessingTest.o" + @echo "... util/tests/stringProcessingTest.i" + @echo "... util/tests/stringProcessingTest.s" + @echo "... util/tests/tokenizerTest.o" + @echo "... util/tests/tokenizerTest.i" + @echo "... util/tests/tokenizerTest.s" @echo "... util/util.o" @echo "... util/util.i" @echo "... util/util.s" diff --git a/parser/Parser.cpp b/parser/Parser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e412e3637cb6e0d474a00b71c794aeaec2e748a7 --- /dev/null +++ b/parser/Parser.cpp @@ -0,0 +1,106 @@ +// +// Created by Jake Close on 3/5/18. +// + + + +#include "Parser.h" + + +/** + * Parses file + * @param inFile + * @return + */ +//TODO instead of grabbing each line, look to see if beginning of +// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found +void Parser::parse ( string html, Tokenizer *tokenizer ) + { + auto htmlIt = html.begin(); + int offset = 0; + while (htmlIt != html.end()) + { + // if open bracket + if ( *htmlIt == '<' ) + { + auto begCloseTag = findNext ("</", htmlIt); + auto endCloseTag = findNext ( ">", begCloseTag); + string line (htmlIt, endCloseTag + 1); + htmlIt = endCloseTag + 2; + + // check if line is url + string url = extract_url ( line ); + if (url != "") + { + urlFrontier->Push ( url ); + } + // check if line is title + else + { + string title = extract_title ( line ); + if (title != "") + { + tokenizer->execute ( title, offset ); + } + } + //TODO fix offset? + offset = htmlIt - html.begin(); + } + else + { + ++htmlIt; + } + } + + + } + +/** + * Returns a url, or "" if none + * @param word + * @return + */ +string Parser::extract_url ( string word ) + { + string url = ""; + if ( *findStr ( "<a", word ) != '\0' ) + { + auto foundHref = findStr ( "href", word ); + auto foundHttp = findNext ( "http", foundHref ); + if ( *foundHttp != '\0' ) + { + url = ""; + auto closeTag = findNext ( ">", word.begin ( ) ); + while ( *foundHttp != *closeTag ) + { + url += *foundHttp; + ++foundHttp; + } + } + } + + return url; + } + +/** + * Returns a title, or "" if none + * @param word + * @return + */ +string Parser::extract_title ( string & word ) + { + string title = ""; + char end = '<'; + auto pos = findStr ( "<title>", word ); + if ( *pos != '\0') + { + pos += 7; + while ( *pos != end ) + { + title += *pos; + ++pos; + } + } + return title; + } + diff --git a/parser/Parser.h b/parser/Parser.h index 5d1c1666ef71c3844125d8916f9348e8cdac30cb..17e5258dc9781c2fc728da3ebf91e56d6550788b 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -6,7 +6,7 @@ // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue // - +#pragma once #include <string> #include <functional> #include <queue> @@ -17,6 +17,7 @@ #include "../shared/Document.h" #include "../shared/ProducerConsumerQueue.h" + using namespace std; /** @@ -56,95 +57,24 @@ private: */ //TODO instead of grabbing each line, look to see if beginning of // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found - void parse ( string html, Tokenizer *tokenizer ) - { - auto htmlIt = html.begin(); - int offset = 0; - while (htmlIt != html.end()) - { - // if open bracket - if ( *htmlIt == '<' ) - { - auto begCloseTag = findNext ("</", htmlIt); - auto endCloseTag = findNext ( ">", begCloseTag); - string line (htmlIt, endCloseTag + 1); - htmlIt = endCloseTag + 2; - - // check if line is url - string url = extract_url ( line ); - if (url != "") - { - urlFrontier->Push ( url ); - } - // check if line is title - else - { - string title = extract_title ( line ); - if (title != "") - { - tokenizer->execute ( title, offset ); - } - } - //TODO fix offset? - offset = htmlIt - html.begin(); - } - else - { - ++htmlIt; - } - } + void parse ( string html, Tokenizer *tokenizer ); - } - /** * Returns a url, or "" if none * @param word * @return */ - string extract_url ( string word ) - { - string url = ""; - if ( *findStr ( "<a", word ) != '\0' ) - { - auto foundHref = findStr ( "href", word ); - auto foundHttp = findNext ( "http", foundHref ); - if ( *foundHttp != '\0' ) - { - url = ""; - auto closeTag = findNext ( ">", word.begin ( ) ); - while ( *foundHttp != *closeTag ) - { - url += *foundHttp; - ++foundHttp; - } - } - } - - return url; - } + string extract_url ( string word ); + /** * Returns a title, or "" if none * @param word * @return */ - string extract_title ( string & word ) - { - string title = ""; - char end = '<'; - auto pos = findStr ( "<title>", word ); - if ( *pos != '\0') - { - pos += 7; - while ( *pos != end ) - { - title += *pos; - ++pos; - } - } - return title; - } + string extract_title ( string & word ); + }; diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 07d973edc382a8bf4c0e5342422982ab2a7f4bdb..75c6cc8407b09d376c4152c896b48e620bd9907f 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -27,7 +27,10 @@ void testSimple ( ) { ProducerConsumerQueue < string > urlFrontierTest; - Document document ( "<title>This Cat Title Cat</title>" ); + ParsedUrl url = ParsedUrl("testurl.com"); + char docString[10240]; + strcpy(docString, "<title>This Cat Title Cat</title>"); + Document document ( url, docString); Parser parser ( &urlFrontierTest ); auto dictionary = parser.execute ( &document ); @@ -49,13 +52,15 @@ void testComplex ( ) ProducerConsumerQueue < string > urlFrontierTest; ifstream file("../tests/cats.html"); string temp; - string docString = "<title>Joe the Cat</title>\n"; - docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n"; + char docString[10240]; + strcpy(docString, "<title>Joe the Cat</title>\n"); + + strcat(docString, "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n"); while(std::getline(file, temp)) { - docString += temp; + //strcat(docString, str(temp)); } - - Document document ( docString ); + ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html"); + Document document ( url, docString ); Parser parser ( &urlFrontierTest ); auto dictionary = parser.execute ( &document ); diff --git a/shared/Document.cpp b/shared/Document.cpp new file mode 100644 index 0000000000000000000000000000000000000000..204dde3c45f027e10d494de0cd70034e4a7d9a36 --- /dev/null +++ b/shared/Document.cpp @@ -0,0 +1,101 @@ +// +// Created by Jake Close on 3/5/18. +// + + + +#include "Document.h" + +string Document::DocToString ( ) + { + return string ( docString, strlen ( docString ) ) + "\n"; + } + +int Document::WriteToDocMap ( ) + { + + pthread_mutex_lock ( &docMap_mutex ); + + //for now just write url + + string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP; + int file = util::getFileDescriptor ( loc.c_str ( ), "W" ); + off_t resultPosition = 0; + + try + { + //check if its available + if ( file == -1 ) + { + throw ( "error opening docMap" ); + } + else + { + //get the current size of the docMap + size_t seekPosition = util::FileSize ( file ); + //seek to the end of the file + resultPosition = lseek ( file, seekPosition, SEEK_SET ); + + if ( resultPosition == -1 ) + { + throw ( "Could not seek" ); + } + cout << "Current docMap position on disk" << endl; + cout << resultPosition << endl; + + size_t success = write ( file, this->DocToString ( ).c_str ( ), + strlen ( this->DocToString ( ).c_str ( ) ) ); + if ( success == -1 ) + { + throw ( "Error writing document object to document map" ); + } + } + } + catch ( const char *str ) + { + cerr << str << endl; + close ( file ); + pthread_mutex_unlock ( &docMap_mutex ); + return -1; + } + close ( file ); + pthread_mutex_unlock ( &docMap_mutex ); + return resultPosition; + } + + +void Document::PrintDocMap ( string url, int location ) + { + pthread_mutex_lock ( &docMap_mutex ); + + std::cout << url << " is " << location; + + string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP; + int file = util::getFileDescriptor ( loc.c_str ( ), "R" ); + + + //check if its available + if ( file ) + { + off_t resultPosition = lseek ( file, ( size_t ) location, SEEK_SET ); + int bytes = 14; + if ( bytes > 0 ) + { + char *buffer = new char[bytes]; + ssize_t bytesRead; + if ( bytesRead = read ( file, buffer, bytes ) ) + write ( 1, buffer, bytesRead ); + else + { + cerr << "Could not read " << bytes << " bytes at position " << + resultPosition << ", error = " << errno; + pthread_mutex_unlock ( &docMap_mutex ); + return; + } + } + + } + pthread_mutex_unlock ( &docMap_mutex ); + return; + +}; \ No newline at end of file diff --git a/shared/Document.h b/shared/Document.h index 5aca64dabde2f8c93aa5f6b49f85f531b8a0de34..acc5b6ace753bff14f607348026c4ccd0444c91a 100644 --- a/shared/Document.h +++ b/shared/Document.h @@ -8,6 +8,7 @@ #include <string> #include <vector> #include <pthread.h> +#include "../util/util.h" using namespace std; @@ -23,107 +24,26 @@ class Document { private: ParsedUrl url; + char *docString; long docID; bool lastCrawlStatus; int lastCrawlDate; - int lastCrawlPageCount; + int lastCrawlWordCount; //add more info fields here public: - Document ( string url_in ) : url ( ParsedUrl ( url_in ) ) - { } - - string DocToString ( ) - { - return string ( url.CompleteUrl, strlen ( url.CompleteUrl ) ) + "\n"; - } - - int WriteToDocMap ( ) - { - - pthread_mutex_lock ( &docMap_mutex ); - - //for now just write url - - string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP; - int file = util::getFileDescriptor ( loc.c_str ( ), "W" ); - off_t resultPosition = 0; - try - { - //check if its available - if ( file == -1 ) - { - throw ( "error opening docMap" ); - } - else - { - //get the current size of the docMap - size_t seekPosition = util::FileSize ( file ); - //seek to the end of the file - resultPosition = lseek ( file, seekPosition, SEEK_SET ); - - if ( resultPosition == -1 ) - { - throw ( "Could not seek" ); - } - cout << "Current docMap position on disk" << endl; - cout << resultPosition << endl; - - size_t success = write ( file, this->DocToString ( ).c_str ( ), - strlen ( this->DocToString ( ).c_str ( ) ) ); - if ( success == -1 ) - { - throw ( "Error writing document object to document map" ); - } - } - } - catch ( const char *str ) - { - cerr << str << endl; - close ( file ); - pthread_mutex_unlock ( &docMap_mutex ); - return -1; - } - close ( file ); - pthread_mutex_unlock ( &docMap_mutex ); - return resultPosition; - } - - - static void PrintDocMap ( string url, int location ) - { - pthread_mutex_lock ( &docMap_mutex ); + Document( ParsedUrl url_in ) : url((url_in)), docString( nullptr ) + { } - std::cout << url << " is " << location; + Document( ParsedUrl url_in, char *docStringIn ) : url((url_in)), docString( docStringIn ) + { } - string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP; - int file = util::getFileDescriptor ( loc.c_str ( ), "R" ); + string DocToString(); + int WriteToDocMap(); - //check if its available - if ( file ) - { - off_t resultPosition = lseek ( file, ( size_t ) location, SEEK_SET ); - int bytes = 14; - if ( bytes > 0 ) - { - char *buffer = new char[bytes]; - ssize_t bytesRead; - if ( bytesRead = read ( file, buffer, bytes ) ) - write ( 1, buffer, bytesRead ); - else - { - cerr << "Could not read " << bytes << " bytes at position " << - resultPosition << ", error = " << errno; - pthread_mutex_unlock ( &docMap_mutex ); - return; - } - } - } - pthread_mutex_unlock ( &docMap_mutex ); - return; - } + static void PrintDocMap( string url, int location ); }; \ No newline at end of file diff --git a/util/Stemmer.h b/util/Stemmer.h index 84e1990c0578ed00b4bc32ecff7c3d529f6a6607..4acbf35bb41d4b3ca1afe3cb056908152e71a553 100644 --- a/util/Stemmer.h +++ b/util/Stemmer.h @@ -2,9 +2,8 @@ // Created by Veronica Day on 2/22/18. // -#ifndef EECS398_SEARCH_STEMMER_H -#define EECS398_SEARCH_STEMMER_H +#pragma once class Stemmer { @@ -12,4 +11,3 @@ class Stemmer }; -#endif //EECS398_SEARCH_STEMMER_H diff --git a/util/Tokenizer.h b/util/Tokenizer.h index 3de99f5bbde49ed7438b7831bc5f2f1516d034be..ba27e43cbc32eaa1daa350ad579e398bc512c1bb 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -1,7 +1,7 @@ // // Created by anvia on 1/31/2018. // - +#pragma once #include <string> #include <unordered_map> #include <vector> diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 40056e0f707173c184f474f0e699abb6cbda6b72..03c18a6a0c0d8500995cc8cf7a519c1ab69d77e8 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -2,9 +2,7 @@ // Created by anvia on 1/31/2018. // -#ifndef EECS398_SEARCH_STRINGPROCESSING_H -#define EECS398_SEARCH_STRINGPROCESSING_H - +#pragma once #include <string> #include <unordered_map> #include <vector> @@ -201,4 +199,3 @@ string stemWord(string word) return ""; } -#endif //EECS398_SEARCH_STRINGPROCESSING_H