Skip to content
Snippets Groups Projects
Commit 937302fb authored by jsclose's avatar jsclose
Browse files

working on converting urls to ParsedUrl instances, trying to fill buffer

parent 6f1b122b
No related branches found
No related tags found
No related merge requests found
......@@ -11,7 +11,7 @@
class SocketReader : public StreamReader
{
public:
SocketReader( string url_in ) : url( ParsedUrl( url_in ) ) { }
SocketReader( ParsedUrl url_in ) : url( url_in ) { }
virtual void fillBuffer();
void httpRequest();
void httpsRequest();
......
......@@ -35,9 +35,9 @@ void Spider::FuncToRun()
{
string currentUrl = getUrl( ); //get url from url frontier
string stringUrl = getUrl( ); //get url from url frontier
char *fileMap;
ParsedUrl currentUrl = ParsedUrl(stringUrl);
//url has not seen before or time since seen is past certain criteria
if ( shouldURLbeCrawled( currentUrl ))
{
......@@ -47,9 +47,9 @@ void Spider::FuncToRun()
StreamReader *reader = request( currentUrl );
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + string(url.Host, strlen(url.Host)) + ".txt";
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + currentUrl.Host + ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
//parser.parse(reader);
cond = true;
}
......@@ -76,7 +76,7 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
bool Spider::writeDocToDisk(string url)
bool Spider::writeDocToDisk(ParsedUrl url)
{
Document d(url);
int resultPosition = d.WriteToDocMap();
......@@ -84,7 +84,7 @@ bool Spider::writeDocToDisk(string url)
return false;
}
this->docMapLookup->insert( std::pair < string, int >( url, resultPosition ));
this->docMapLookup->insert( std::pair < string, int >( url.CompleteUrl, resultPosition ));
for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
std::cout << it->first << " => " << it->second << '\n';
......@@ -93,10 +93,10 @@ bool Spider::writeDocToDisk(string url)
bool Spider::shouldURLbeCrawled( string url )
bool Spider::shouldURLbeCrawled( ParsedUrl url )
{
//search for url in doc cache
auto locationOnDisk = this->docMapLookup->find( url );
auto locationOnDisk = this->docMapLookup->find( url.CompleteUrl );
//bool protectedByRobots = checkRobots( url );
//if it doesn't find anything for that url key
......@@ -107,16 +107,15 @@ bool Spider::shouldURLbeCrawled( string url )
else
{
//Just for testing
Document::PrintDocMap(url, locationOnDisk->second);
Document::PrintDocMap(url.CompleteUrl, locationOnDisk->second);
}
return false;
}
//check if path in url is in the robots txt
bool Spider::checkRobots(string url_in)
bool Spider::checkRobots(ParsedUrl url)
{
ParsedUrl url = ParsedUrl(url_in);
string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt";
int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
//File does not exist yet
......@@ -163,14 +162,14 @@ returns true if fileMap was created, otherwise false
Modifies the filemap to be a char* of the file of the url passed
*/
StreamReader* Spider::request( string url )
StreamReader* Spider::request( ParsedUrl url )
{
string localFile;
StreamReader *newReader;
if ( this->mode == "local" )
{
newReader = new LocalReader( url );
newReader = new LocalReader( url.CompleteUrl );
}
else if ( this->mode == "web" )
{
......
......@@ -32,14 +32,14 @@ public:
//Makes request to given url
// if successful, writes file to disk, stores location to member value
// else return false and error information, retry if necessary
StreamReader *request( string url );
StreamReader *request( ParsedUrl url );
bool writeDocToDisk(string url);
bool writeDocToDisk(ParsedUrl url);
bool shouldURLbeCrawled( string URL );
bool shouldURLbeCrawled( ParsedUrl URL );
int getRobots(ParsedUrl url );
bool checkRobots(string url);
bool checkRobots(ParsedUrl url);
private:
......
t":{"value":4,"limit":500},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 237.721 1 -total"," 40.32% 95.841 1 Template:Reflist"," 33.35% 79.290 1 Template:Infobox_film_awards"," 30.80% 73.221 8 Template:Cite_web"," 26.88% 63.910 1 Template:Infobox"," 11.62% 27.620 1 Template:EngvarB"," 6.65% 15.797 2 Template:DMCA"," 6.32% 15.026 1 Template:BAFTA_Film_Awards_Chron"," 6.04% 14.360 2 Template:Dated_maintenance_category"," 5.18% 12.316 1 Template:Navbox"]},"scribunto":{"limitreport-timeusage":{"value":"0.100","limit":"10.000"},"limitreport-memusage":{"value":3947757,"limit":52428800}},"cachereport":{"origin":"mw1312","timestamp":"20180221214801","ttl":1900800,"transientcontent":false}}});});</script><script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":96,"wgHostname":"mw1330"});});</script>
</body>
</html>
/div>
</div>
</div>
</div>
<div id="footer" role="contentinfo">
<ul id="footer-info">
<li id="footer-info-lastmod"> This page was last edited on 21 February 2018, at 21:48.</li>
<li id="footer-info-copyright">Text is available under the <a rel="license" href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License">Creative Commons Attribution-ShareAlike License</a><a rel="license" href="//creativecommons.org/licenses/by-sa/3.0/" style="display:none;"></a>;
additional terms may apply. By using this site, you agree to the <a href="//wikimediafoundation.org/wiki/Terms_of_Use">Terms of Use</a> and <a href="//wikimediafoundation.org/wiki/Privacy_policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a href="//www.wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
</ul>
<ul id="footer-places">
<li id="footer-places-privacy"><a href="https://wikimediafoundation.org/wiki/Privacy_policy" class="extiw" title="wmf:Privacy policy">Privacy policy</a></li>
<li id="footer-places-about"><a href="/wiki/Wikipedia:About" title="Wikipedia:About">About Wikipedia</a></li>
<li id="footer-places-disclaimer"><a href="/wiki/Wikipedia:General_disclaimer" title="Wikipedia:General disclaimer">Disclaimers</a></li>
<li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li>
<li id="footer-places-developers"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute">Developers</a></li>
<li id="footer-places-cookiestatement"><a href="https://wikimediafoundation.org/wiki/Cookie_statement">Cookie statement</a></li>
<li id="footer-places-mobileview"><a href="//en.m.wikipedia.org/w/index.php?title=71st_British_Academy_Film_Awards&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li>
</ul>
<ul id="footer-icons" class="noprint">
<li id="footer-copyrightico">
<a href="https://wikimediafoundation.org/"><img src="/static/images/wikimedia-button.png" srcset="/static/images/wikimedia-button-1.5x.png 1.5x, /static/images/wikimedia-button-2x.png 2x" width="88" height="31" alt="Wikimedia Foundation"/></a> </li>
<li id="footer-poweredbyico">
<a href="//www.mediawiki.org/"><img src="/static/images/poweredby_mediawiki_88x31.png" alt="Powered by MediaWiki" srcset="/static/images/poweredby_mediawiki_132x47.png 1.5x, /static/images/poweredby_mediawiki_176x62.png 2x" width="88" height="31"/></a> </li>
</ul>
<div style="clear: both;"></div>
</div>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgPageParseReport":{"limitreport":{"cputime":"0.264","walltime":"0.328","ppvisitednodes":{"value":1822,"limit":1000000},"ppgeneratednodes":{"value":0,"limit":1500000},"postexpandincludesize":{"value":44633,"limit":2097152},"templateargumentsize":{"value":2117,"limit":2097152},"expansiondepth":{"value":12,"limit":40},"expensivefunctioncoun
\ No newline at end of file
<li>
<a href="https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree">NYT Store</a>
</li>
<li>
<a href="https://www.nytimes.com/times-journeys/?utm_source=nytimes&utm_medium=HPLink&utm_content=hp_browsetree&utm_campaign=NYT-HP">Times Journeys</a>
</li>
<li>
<a href="https://www.nytimes.com/seeallnav">Subscribe</a>
</li>
<li>
<a href="https://www.nytimes.com/membercenter">Manage My Account</a>
</li>
<li>
<a href="http://www.nytco.com">NYTCo</a>
</li>
</ul>
</div><!-- close column -->
<div class="column last-column">
<h3 class="menu-heading">Subscribe</h3>
<ul class="menu primary-menu">
<li class="menu-label">Subscribe</li>
<li class="home-delivery">
<i class="icon sprite-icon"></i>
<a class="nyt-home-delivery" href="https://www.nytimes.com/hdleftnav">Home Delivery</a>
</li>
<li class="digital-subscriptions">
<i class="icon sprite-icon"></i>
<a class="digital-subscription" href="https://www.nytimes.com/digitalleftnav">Digital Subscriptions</a>
</li>
<li class="nyt-crossword last-item">
<i class="icon sprite-icon"></i>
<a id="nyt-crossword" href="https://www.nytimes.com/crosswords/index.html">Crossword</a>
</li>
</ul>
<ul class="menu secondary-menu">
<li class="email-newsletters">
<a href="https://www.nytimes.com/marketing/newsletters">Email Newsletters</a>
</li>
<li>
<a href="https://myaccount.nytimes.com/mem/tnt.html">Alerts</a>
</li>
<li class="gift-subscription">
<a href="https://www.nytimes.com/giftleftnav">Gift Subscriptions</a>
</li>
<li>
<a href="https://www.nytimes.com/corporateleftnav">Group Subscriptions</a>
</li>
<li>
<a href="https://www.nytimes.com/educationleftnav">Education Rate</a>
</li>
</ul>
<ul class="menu secondary-menu">
<li>
<a href="https://www.nytimes.com/services/mobile/index.html">Mobile Applications</a>
</li>
<li>
<a href="http://eedition.nytimes.com/cgi-bin/signup.cgi?cc=37FYY">Replica Edition</a>
</li>
</ul>
</div><!-- close column -->
</div><!-- close split-6-layout -->
</nav><!-- close nav -->
</section><!-- close site-index -->
<footer id="page-footer" class="page-footer" role="contentinfo">
<nav>
<h2 class="visually-hidden">Site Information Navigation</h2>
<ul>
<li>
<a href="https://www.nytimes.com/content/help/rights/copyright/copyright-notice.html" itemprop="copyrightNotice">
&copy; <span itemprop="copyrightYear">2018</span><span itemprop="copyrightHolder provider sourceOrganization" itemscope itemtype="http://schema.org/Organization" itemid="http://www.nytimes.com"><span itemprop="name"> The New York Times Company</span><meta itemprop="tickerSymbol" content="NYSE NYT"/></span>
</a>
</li>
<li class="wide-viewport-item"><a href="https://www.nytimes.com/ref/membercenter/help/infoservdirectory.html">Contact Us</a></li>
<li class="wide-viewport-item"><a href="http://www.nytco.com/careers">Work With Us</a></li>
<li class="wide-viewport-item"><a href="http://nytmediakit.com/">Advertise</a></li>
<li class="wide-viewport-item"><a href="https://www.nytimes.com/content/help/rights/privacy/policy/privacy-policy.html#pp">Your Ad Choices</a></li>
<li><a href="https://www.nytimes.com/privacy">Privacy</a></li>
<li><a href="https://www.nytimes.com/ref/membercenter/help/agree.html" itemprop="usageTerms">Terms of Service</a></li>
<li class="wide-viewport-item last-item"><a href="https://www.nytimes.com/content/help/rights/sale/terms-of-sale.html">Terms of Sale</a></li>
</ul>
</nav>
<nav class="last-nav">
<h2 class="visually-hidden">Site Information Navigation</h2>
<ul>
<li><a href="http://spiderbites.nytimes.com">Site Map</a></li>
<li><a href="https://www.nytimes.com/membercenter/sitehelp.html">Help</a></li>
<li><a href="https://myaccount.nytimes.com/membercenter/feedback.html">Site Feedback</a></li>
<li class="wide-viewport-item last-item"><a href="https://www.nytimes.com/subscriptions/Multiproduct/lp5558.html?campaignId=37WXW">Subscriptions</a></li>
</ul>
</nav>
<div id="mobile-banner" class="mobile-banner hidden">
<a class="banner-message" href="https://mobile.nytimes.com/">View Mobile Version</a>
</div>
<div id="dfp-perf-test" class="ad hidden"></div>
</footer>
</div><!-- close page -->
</div><!-- close shell -->
<script>
require(['foundation/main'], function () {
require(['homepage/main']);
require(['jquery/nyt', 'foundation/views/page-manager'], function ($, pageManager) {
if (window.location.search.indexOf('disable_tagx') > 0) {
return;
}
$(document).ready(function () {
require(['https://a1.nyt.com/analytics/json-kidd.min.js'], function () {
pageManager.trackingFireEventQueue();
});
});
});
});
</script>
<!--esi
<esi:include src="/appconfig/https/show-modal.js" />
-->
<script>
require(['foundation/main'], function() {
require(['shared/audio/instances/audio']);
});
</script>
<div id="Inv1" class="ad inv1-ad hidden"></div>
<div id="Inv2" class="ad inv2-ad hidden"></div>
<div id="Inv3" class="ad inv3-ad hidden"></div>
<div id="ab1" class="ad ab1-ad hidden"></div>
<div id="ab2" class="ad ab2-ad hidden"></div>
<div id="ab3" class="ad ab3-ad hidden"></div>
<div id="prop1" class="ad prop1-ad hidden"></div>
<div id="prop2" class="ad prop2-ad hidden"></div>
<div id="Anchor" class="ad anchor-ad hidden"></div>
<div id="ADX_CLIENTSIDE" class="ad adx-clientside-ad hidden"></div>
</body>
</html>
</li>
<li>
<a href="https://www.nytimes.com/section/arts/design">Art & Design</a>
</li>
<li>
<a href="https://www.nytimes.com/section/books">Books</a>
</li>
<li>
<a href="https://www.nytimes.com/section/arts/dance">Dance</a>
</li>
<li>
<a href="https://www.nytimes.com/section/movies">Movies</a>
</li>
<li>
<a href="https://www.nytimes.com/section/arts/music">Music</a>
</li>
<li>
<a href="https://www.nytimes.com/events/">N.Y.C. Events Guide</a>
</li>
<li>
<a href="https://www.nytimes.com/section/arts/television">Television</a>
</li>
<li>
<a href="https://www.nytimes.com/section/theater">Theater</a>
</li>
<li>
<a href="https://www.nytimes.com/video/arts">Video: Arts</a>
</li>
</ul>
</div><!-- close column -->
\ No newline at end of file
No preview for this file type
......@@ -31,7 +31,7 @@ class Document
//add more info fields here
public:
Document(string url_in) : url(ParsedUrl(url_in)) {}
Document(ParsedUrl url_in) : url((url_in)) {}
string DocToString()
{
......
......@@ -106,6 +106,7 @@ public:
~ParsedUrl( )
{
    // Free the buffer BEFORE clearing the pointer. The previous order
    // (pathBuffer = 0; delete [ ] pathBuffer;) nulled the pointer first, so
    // the delete acted on a null pointer, did nothing, and leaked the buffer.
    //
    // NOTE(review): assumes pathBuffer is either null or was allocated with
    // new[] by this class -- confirm against the constructor.
    // NOTE(review): ParsedUrl is passed and stored by value elsewhere; if the
    // class has no user-defined copy constructor / copy assignment, two copies
    // will share pathBuffer and this delete will run twice. Verify the rule of
    // three is honored (or copying is disabled) before relying on this.
    delete [ ] pathBuffer;
    pathBuffer = nullptr;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment