Skip to content
Snippets Groups Projects
Commit 3d5f1a5e authored by jsclose's avatar jsclose
Browse files

stable crawler

parent 8eaaf7c5
Branches
No related tags found
No related merge requests found
No preview for this file type
......@@ -86,20 +86,24 @@ void Spider::run ( )
if ( shouldURLbeCrawled( docID ) )
{
StreamReader *reader = SR_factory( currentUrl, this->mode );
bool success = reader->request( );
if ( success )
if(reader)
{
cout << "Parsing " << currentUrl.CompleteUrl;
DocIndex *dict = parser.execute( reader );
IndexerQueue->Push( dict );
printDocIndex(dict);
reader->closeReader( );
//delete dict;
cond++;
bool success = reader->request( );
if ( success )
{
cout << "Parsing " << currentUrl.CompleteUrl;
DocIndex *dict = parser.execute( reader );
IndexerQueue->Push( dict );
printDocIndex(dict);
reader->closeReader( );
//delete dict;
cond++;
}
}
delete reader;
......
......@@ -58,9 +58,10 @@ int main ( int argc, char *argv[] )
*/
SSL_library_init( );
ParsedUrl url = ParsedUrl("http://www.boston.com/cars/specials/herb_chambers_cjd/ram_millbury.htmlhttp://www.jimmyfund.org/ways-to-give/corporate-engagement/cause-marketing/participating-companies/herb-chambers-automotive-family/");
string url1 = "https://fivethirtyeight.com/features/fear-not-readers-we-have-rss-feeds/";
string url2 = "https:";
ParsedUrl url = ParsedUrl(url2);
urlFrontier->Push(url);
indexer.StartThread( );
Crawler crawler( mode, urlFrontier, IndexerQueue );
......
http://www.boston.com/cars/specials/herb_chambers_cjd/ram_millbury.htmlhttp://www.jimmyfund.org/ways-to-give/corporate-engagement/cause-marketing/participating-companies/herb-chambers-automotive-family/
\ No newline at end of file
https://fivethirtyeight.com/features/fear-not-readers-we-have-rss-feeds/
http://http://googlereader.blogspot.com/2013/07/a-final-farewell.html
\ No newline at end of file
No preview for this file type
......@@ -74,14 +74,20 @@ public:
//char * domainBuffer = new char[ 20 ];
//get the domain:
char *i = Host;
for ( ; *i; i++ )
Domain = null;
if(i)
{
for ( ; *i; i++ )
{
if ( *i == Period )
Domain = i;
if ( *i == Period )
Domain = i;
}
}
// Whatever remains is the Path. // need to remove fragments
Path = p;
......@@ -115,24 +121,29 @@ public:
{
double lengthOfUrl = strlen(CompleteUrl);
Score += 4 * 1/ log( lengthOfUrl );
if(this->Domain != nullptr)
if(lengthOfUrl > 4)
{
if(this->Domain )
{
if ( strcmp ( Domain , ORG ) )
Score += 5;
else if ( strcmp ( Domain , EDU ) )
Score += 4;
else if ( strcmp ( Domain , GOV ) )
Score += 3;
else if ( strcmp ( Domain , COM ) )
Score += 2;
else if ( strcmp ( Domain , NET ) )
Score += 1;
else if ( strcmp ( Domain , INT ) )
Score += 1;
else if ( strcmp ( Domain , MIL ) )
Score += .5;
}
if ( strcmp ( Domain , ORG ) )
Score += 5;
else if ( strcmp ( Domain , EDU ) )
Score += 4;
else if ( strcmp ( Domain , GOV ) )
Score += 3;
else if ( strcmp ( Domain , COM ) )
Score += 2;
else if ( strcmp ( Domain , NET ) )
Score += 1;
else if ( strcmp ( Domain , INT ) )
Score += 1;
else if ( strcmp ( Domain , MIL ) )
Score += .5;
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment