From 937302fb5d8e1e6704ab954d673faf463dd40731 Mon Sep 17 00:00:00 2001 From: jsclose <jsclose@umich.edu> Date: Wed, 21 Feb 2018 20:04:16 -0500 Subject: [PATCH] working on converting urls to ParsedUrl instances, trying to fill buffer --- crawler/SocketReader.h | 2 +- crawler/spider.cpp | 25 ++-- crawler/spider.h | 8 +- crawlerOutput/en.wikipedia.org.txt | 31 +++++ crawlerOutput/www.nytimes.com.txt | 214 +++++++++++++++++++++++++++++ docMap.txt | Bin 3423 -> 3941 bytes shared/documentMap.h | 2 +- shared/url.h | 1 + 8 files changed, 264 insertions(+), 19 deletions(-) create mode 100755 crawlerOutput/en.wikipedia.org.txt create mode 100755 crawlerOutput/www.nytimes.com.txt diff --git a/crawler/SocketReader.h b/crawler/SocketReader.h index 1a1bfdf..9c36a0e 100644 --- a/crawler/SocketReader.h +++ b/crawler/SocketReader.h @@ -11,7 +11,7 @@ class SocketReader : public StreamReader { public: - SocketReader( string url_in ) : url( ParsedUrl( url_in ) ) { } + SocketReader( ParsedUrl url_in ) : url( url_in ) { } virtual void fillBuffer(); void httpRequest(); void httpsRequest(); diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 0569993..3aee786 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -35,9 +35,9 @@ void Spider::FuncToRun() { - string currentUrl = getUrl( ); //get url from url frontier + string stringUrl = getUrl( ); //get url from url frontier char *fileMap; - + ParsedUrl currentUrl = ParsedUrl(stringUrl); //url has not seen before or time since seen is past certain criteria if ( shouldURLbeCrawled( currentUrl )) { @@ -47,9 +47,9 @@ void Spider::FuncToRun() StreamReader *reader = request( currentUrl ); - string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + string(url.Host, strlen(url.Host)) + ".txt"; + string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + currentUrl.Host + ".txt"; int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk); - + //parser.parse(reader); cond = true; } @@ -76,7 +76,7 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec -bool Spider::writeDocToDisk(string url) +bool Spider::writeDocToDisk(ParsedUrl url) { Document d(url); int resultPosition = d.WriteToDocMap(); @@ -84,7 +84,7 @@ bool Spider::writeDocToDisk(string url) return false; } - this->docMapLookup->insert( std::pair < string, int >( url, resultPosition )); + this->docMapLookup->insert( std::pair < string, int >( url.CompleteUrl, resultPosition )); for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it ) std::cout << it->first << " => " << it->second << '\n'; @@ -93,10 +93,10 @@ bool Spider::writeDocToDisk(string url) -bool Spider::shouldURLbeCrawled( string url ) +bool Spider::shouldURLbeCrawled( ParsedUrl url ) { //search for url in doc cache - auto locationOnDisk = this->docMapLookup->find( url ); + auto locationOnDisk = this->docMapLookup->find( url.CompleteUrl ); //bool protectedByRobots = checkRobots( url ); //if it doesnt find anything for that url key @@ -107,16 +107,15 @@ bool Spider::shouldURLbeCrawled( string url ) else { //Just for testing - Document::PrintDocMap(url, locationOnDisk->second); + Document::PrintDocMap(url.CompleteUrl, locationOnDisk->second); } return false; } //check if path in url is in the robots txt -bool Spider::checkRobots(string url_in) +bool Spider::checkRobots(ParsedUrl url) { - ParsedUrl url = ParsedUrl(url_in); string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt"; int robotsFileD = util::getFileDescriptor(pathToRobots , "R"); //File does not exist yet @@ -163,14 +162,14 @@ returns true if fileMap was created, otherwise false Modifies the filemap to be a char* of the file of the url passed */ -StreamReader* Spider::request( string url ) +StreamReader* Spider::request( ParsedUrl url ) { string localFile; StreamReader *newReader; if ( this->mode == "local" ) { - newReader = new LocalReader( url ); + newReader = new LocalReader( url.CompleteUrl ); } else if ( this->mode == "web" ) { diff --git a/crawler/spider.h b/crawler/spider.h index 4dbff98..c12e6e1 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -32,14 +32,14 @@ public: //Makes request to given url // if successful, writes file to disk, stores location to memeber value // else return false and error information, retry if necessary - StreamReader *request( string url ); + StreamReader *request( ParsedUrl url ); - bool writeDocToDisk(string url); + bool writeDocToDisk(ParsedUrl url); - bool shouldURLbeCrawled( string URL ); + bool shouldURLbeCrawled( ParsedUrl URL ); int getRobots(ParsedUrl url ); - bool checkRobots(string url); + bool checkRobots(ParsedUrl url); private: diff --git a/crawlerOutput/en.wikipedia.org.txt b/crawlerOutput/en.wikipedia.org.txt new file mode 100755 index 0000000..2bd2d09 --- /dev/null +++ b/crawlerOutput/en.wikipedia.org.txt @@ -0,0 +1,31 @@ +t":{"value":4,"limit":500},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 237.721 1 -total"," 40.32% 95.841 1 Template:Reflist"," 33.35% 79.290 1 Template:Infobox_film_awards"," 30.80% 73.221 8 Template:Cite_web"," 26.88% 63.910 1 Template:Infobox"," 11.62% 27.620 1 Template:EngvarB"," 6.65% 15.797 2 Template:DMCA"," 6.32% 15.026 1 Template:BAFTA_Film_Awards_Chron"," 6.04% 14.360 2 Template:Dated_maintenance_category"," 5.18% 12.316 1 Template:Navbox"]},"scribunto":{"limitreport-timeusage":{"value":"0.100","limit":"10.000"},"limitreport-memusage":{"value":3947757,"limit":52428800}},"cachereport":{"origin":"mw1312","timestamp":"20180221214801","ttl":1900800,"transientcontent":false}}});});</script><script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":96,"wgHostname":"mw1330"});});</script> + </body> +</html> +/div> + </div> + </div> + </div> + <div id="footer" role="contentinfo"> + <ul id="footer-info"> + <li id="footer-info-lastmod"> This page was last edited on 21 February 2018, at 21:48.</li> + <li id="footer-info-copyright">Text is available under the <a rel="license" href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License">Creative Commons Attribution-ShareAlike License</a><a rel="license" href="//creativecommons.org/licenses/by-sa/3.0/" style="display:none;"></a>; +additional terms may apply. By using this site, you agree to the <a href="//wikimediafoundation.org/wiki/Terms_of_Use">Terms of Use</a> and <a href="//wikimediafoundation.org/wiki/Privacy_policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a href="//www.wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> + </ul> + <ul id="footer-places"> + <li id="footer-places-privacy"><a href="https://wikimediafoundation.org/wiki/Privacy_policy" class="extiw" title="wmf:Privacy policy">Privacy policy</a></li> + <li id="footer-places-about"><a href="/wiki/Wikipedia:About" title="Wikipedia:About">About Wikipedia</a></li> + <li id="footer-places-disclaimer"><a href="/wiki/Wikipedia:General_disclaimer" title="Wikipedia:General disclaimer">Disclaimers</a></li> + <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> + <li id="footer-places-developers"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute">Developers</a></li> + <li id="footer-places-cookiestatement"><a href="https://wikimediafoundation.org/wiki/Cookie_statement">Cookie statement</a></li> + <li id="footer-places-mobileview"><a href="//en.m.wikipedia.org/w/index.php?title=71st_British_Academy_Film_Awards&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> + </ul> + <ul id="footer-icons" class="noprint"> + <li id="footer-copyrightico"> + <a href="https://wikimediafoundation.org/"><img src="/static/images/wikimedia-button.png" srcset="/static/images/wikimedia-button-1.5x.png 1.5x, /static/images/wikimedia-button-2x.png 2x" width="88" height="31" alt="Wikimedia Foundation"/></a> </li> + <li id="footer-poweredbyico"> + <a href="//www.mediawiki.org/"><img src="/static/images/poweredby_mediawiki_88x31.png" alt="Powered by MediaWiki" srcset="/static/images/poweredby_mediawiki_132x47.png 1.5x, /static/images/poweredby_mediawiki_176x62.png 2x" width="88" height="31"/></a> </li> + </ul> + <div style="clear: both;"></div> + </div> + <script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgPageParseReport":{"limitreport":{"cputime":"0.264","walltime":"0.328","ppvisitednodes":{"value":1822,"limit":1000000},"ppgeneratednodes":{"value":0,"limit":1500000},"postexpandincludesize":{"value":44633,"limit":2097152},"templateargumentsize":{"value":2117,"limit":2097152},"expansiondepth":{"value":12,"limit":40},"expensivefunctioncoun \ No newline at end of file diff --git a/crawlerOutput/www.nytimes.com.txt b/crawlerOutput/www.nytimes.com.txt new file mode 100755 index 0000000..f13bf72 --- /dev/null +++ b/crawlerOutput/www.nytimes.com.txt @@ -0,0 +1,214 @@ + + <li> + <a href="https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage®ion=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree">NYT Store</a> + </li> + + + <li> + <a href="https://www.nytimes.com/times-journeys/?utm_source=nytimes&utm_medium=HPLink&utm_content=hp_browsetree&utm_campaign=NYT-HP">Times Journeys</a> + </li> + + + <li> + <a href="https://www.nytimes.com/seeallnav">Subscribe</a> + </li> + + + <li> + <a href="https://www.nytimes.com/membercenter">Manage My Account</a> + </li> + + + <li> + <a href="http://www.nytco.com">NYTCo</a> + </li> + + + </ul> + </div><!-- close column --> + + + <div class="column last-column"> + +<h3 class="menu-heading">Subscribe</h3> + +<ul class="menu primary-menu"> + <li class="menu-label">Subscribe</li> + <li class="home-delivery"> + <i class="icon sprite-icon"></i> + <a class="nyt-home-delivery" href="https://www.nytimes.com/hdleftnav">Home Delivery</a> + </li> + <li class="digital-subscriptions"> + <i class="icon sprite-icon"></i> + <a class="digital-subscription" href="https://www.nytimes.com/digitalleftnav">Digital Subscriptions</a> + </li> + <li class="nyt-crossword last-item"> + <i class="icon sprite-icon"></i> + <a id="nyt-crossword" href="https://www.nytimes.com/crosswords/index.html">Crossword</a> + </li> +</ul> + +<ul class="menu secondary-menu"> + + <li class="email-newsletters"> + <a href="https://www.nytimes.com/marketing/newsletters">Email Newsletters</a> + </li> + <li> + <a href="https://myaccount.nytimes.com/mem/tnt.html">Alerts</a> + </li> + <li class="gift-subscription"> + <a href="https://www.nytimes.com/giftleftnav">Gift Subscriptions</a> + </li> + <li> + <a href="https://www.nytimes.com/corporateleftnav">Group Subscriptions</a> + </li> + <li> + <a href="https://www.nytimes.com/educationleftnav">Education Rate</a> + </li> + +</ul> +<ul class="menu secondary-menu"> + <li> + <a href="https://www.nytimes.com/services/mobile/index.html">Mobile Applications</a> + </li> + <li> + <a href="http://eedition.nytimes.com/cgi-bin/signup.cgi?cc=37FYY">Replica Edition</a> + </li> + +</ul> + </div><!-- close column --> + + </div><!-- close split-6-layout --> + + </nav><!-- close nav --> + +</section><!-- close site-index --> + + <footer id="page-footer" class="page-footer" role="contentinfo"> + <nav> + <h2 class="visually-hidden">Site Information Navigation</h2> + <ul> + <li> + <a href="https://www.nytimes.com/content/help/rights/copyright/copyright-notice.html" itemprop="copyrightNotice"> + © <span itemprop="copyrightYear">2018</span><span itemprop="copyrightHolder provider sourceOrganization" itemscope itemtype="http://schema.org/Organization" itemid="http://www.nytimes.com"><span itemprop="name"> The New York Times Company</span><meta itemprop="tickerSymbol" content="NYSE NYT"/></span> + </a> + </li> + <li class="wide-viewport-item"><a href="https://www.nytimes.com/ref/membercenter/help/infoservdirectory.html">Contact Us</a></li> + <li class="wide-viewport-item"><a href="http://www.nytco.com/careers">Work With Us</a></li> + <li class="wide-viewport-item"><a href="http://nytmediakit.com/">Advertise</a></li> + <li class="wide-viewport-item"><a href="https://www.nytimes.com/content/help/rights/privacy/policy/privacy-policy.html#pp">Your Ad Choices</a></li> + <li><a href="https://www.nytimes.com/privacy">Privacy</a></li> + <li><a href="https://www.nytimes.com/ref/membercenter/help/agree.html" itemprop="usageTerms">Terms of Service</a></li> + <li class="wide-viewport-item last-item"><a href="https://www.nytimes.com/content/help/rights/sale/terms-of-sale.html">Terms of Sale</a></li> + </ul> + </nav> + <nav class="last-nav"> + <h2 class="visually-hidden">Site Information Navigation</h2> + <ul> + <li><a href="http://spiderbites.nytimes.com">Site Map</a></li> + <li><a href="https://www.nytimes.com/membercenter/sitehelp.html">Help</a></li> + <li><a href="https://myaccount.nytimes.com/membercenter/feedback.html">Site Feedback</a></li> + <li class="wide-viewport-item last-item"><a href="https://www.nytimes.com/subscriptions/Multiproduct/lp5558.html?campaignId=37WXW">Subscriptions</a></li> + </ul> + </nav> + + <div id="mobile-banner" class="mobile-banner hidden"> + <a class="banner-message" href="https://mobile.nytimes.com/">View Mobile Version</a> + </div> + + <div id="dfp-perf-test" class="ad hidden"></div> +</footer> + </div><!-- close page --> + </div><!-- close shell --> + <script> +require(['foundation/main'], function () { + require(['homepage/main']); + + + + + require(['jquery/nyt', 'foundation/views/page-manager'], function ($, pageManager) { + if (window.location.search.indexOf('disable_tagx') > 0) { + return; + } + $(document).ready(function () { + require(['https://a1.nyt.com/analytics/json-kidd.min.js'], function () { + pageManager.trackingFireEventQueue(); + }); + }); + }); +}); +</script> +<!--esi +<esi:include src="/appconfig/https/show-modal.js" /> +--> +<script> +require(['foundation/main'], function() { + require(['shared/audio/instances/audio']); +}); +</script> + + <div id="Inv1" class="ad inv1-ad hidden"></div> +<div id="Inv2" class="ad inv2-ad hidden"></div> +<div id="Inv3" class="ad inv3-ad hidden"></div> +<div id="ab1" class="ad ab1-ad hidden"></div> +<div id="ab2" class="ad ab2-ad hidden"></div> +<div id="ab3" class="ad ab3-ad hidden"></div> +<div id="prop1" class="ad prop1-ad hidden"></div> +<div id="prop2" class="ad prop2-ad hidden"></div> +<div id="Anchor" class="ad anchor-ad hidden"></div> +<div id="ADX_CLIENTSIDE" class="ad adx-clientside-ad hidden"></div> +</body> +</html> + </li> + + + <li> + <a href="https://www.nytimes.com/section/arts/design">Art & Design</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/books">Books</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/arts/dance">Dance</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/movies">Movies</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/arts/music">Music</a> + </li> + + + <li> + <a href="https://www.nytimes.com/events/">N.Y.C. Events Guide</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/arts/television">Television</a> + </li> + + + <li> + <a href="https://www.nytimes.com/section/theater">Theater</a> + </li> + + + <li> + <a href="https://www.nytimes.com/video/arts">Video: Arts</a> + </li> + + + </ul> + </div><!-- close column --> + + \ No newline at end of file diff --git a/docMap.txt b/docMap.txt index 4806d35acdf8ff1ace8ffd6197e89ca69b84fe24..4edf9573b7533211a17b773b66f471eee4990903 100644 GIT binary patch delta 35 hcmcaF^;B*{Jn!VSteul1d7LLVaN8r{kjej9Gy(p%4Qc=Y delta 7 OcmaDVcVB8lJTCwaoC6sE diff --git a/shared/documentMap.h b/shared/documentMap.h index 720854e..9ae1058 100644 --- a/shared/documentMap.h +++ b/shared/documentMap.h @@ -31,7 +31,7 @@ class Document //add more info fields here public: - Document(string url_in) : url(ParsedUrl(url_in)) {} + Document(ParsedUrl url_in) : url((url_in)) {} string DocToString() { diff --git a/shared/url.h b/shared/url.h index ed32a41..95c42e5 100644 --- a/shared/url.h +++ b/shared/url.h @@ -106,6 +106,7 @@ public: ~ParsedUrl( ) { + pathBuffer = 0; delete [ ] pathBuffer; } -- GitLab