Skip to content
Snippets Groups Projects
Commit 6edd284d authored by jsclose's avatar jsclose
Browse files

working on reading

parent 24f81e58
Branches
No related tags found
No related merge requests found
.idea/*
crawlerOutput/*
openssl-1.1.0g/*
.vagrant/*
CMakeLists.txt
......
......@@ -29,6 +29,33 @@ char * GetArbitrarySizeBuffer(SSL* ssl)
}
// Reads everything available from socket `s` into a dynamically grown
// heap buffer, enlarging it in 10 KiB increments until recv() reports
// EOF (0) or an error (-1).
//
// @param s  connected socket descriptor to read from.
// @return   heap-allocated, NUL-terminated buffer holding every byte
//           received; caller owns it and must delete[] it.
//
// NOTE(fix): the previous version migrated data between buffers with
// strcpy/strlen, which is undefined behavior on a buffer recv() never
// NUL-terminates, corrupts binary payloads containing '\0', and let
// recv() write up to buf_size bytes regardless of the space actually
// remaining. We track the received byte count explicitly instead.
char * GetArbitrarySizeBuffer(int s )
	{
	const int buf_size = 10240;
	int capacity = buf_size;   // usable bytes (excludes the terminator slot)
	int total = 0;             // bytes received so far
	char* ssl_buffer = new char[capacity + 1];
	int bytes;
	// Only ever ask recv() for the free space that is actually left.
	while ( ( bytes = recv( s, ssl_buffer + total, capacity - total, 0 ) ) > 0 )
		{
		total += bytes;
		if ( total == capacity )
			{
			// Buffer full: grow by one increment and migrate byte-exactly.
			capacity += buf_size;
			char *temp = new char[capacity + 1];
			memcpy( temp, ssl_buffer, total );
			delete[] ssl_buffer;
			ssl_buffer = temp;
			}
		}
	// Terminate so callers may safely treat the result as a C string.
	ssl_buffer[ total ] = '\0';
	return ssl_buffer;
	}
void SocketReader::httpRequest()
{
int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
......@@ -66,13 +93,15 @@ void SocketReader::httpRequest()
// Read from the socket until there's no more data.
char buffer[ 10240 ];
char HTTPbuffer[ 10240 ];
int bytes;
while ( ( bytes = recv( s, buffer, sizeof( buffer ), 0 ) ) > 0 )
write( 1, buffer, bytes );
buffer = GetArbitrarySizeBuffer(s);
close( s );
return;
}
......
......@@ -18,6 +18,17 @@
#include "SocketReader.h"
#include "../shared/documentMap.h"
// djb2 string hash (http://www.cse.yorku.ca/~oz/hash.html), used here
// to derive a stable numeric document ID from a URL string.
//
// @param s  NUL-terminated string to hash; must not be null.
// @return   djb2 hash of the bytes of `s`.
//
// NOTE(fix): removed a redundant doubled brace scope, and the byte is
// now read through unsigned char — with plain (possibly signed) char,
// bytes >= 0x80 would hash differently across platforms.
size_t Spider::hash(const char * s)
	{
	size_t h = 5381;
	int c;
	// h = h * 33 + c, seeded with 5381.
	while ( ( c = static_cast<unsigned char>( *s++ ) ) )
		h = ( ( h << 5 ) + h ) + c;
	return h;
	}
string Spider::getUrl()
{
......@@ -47,7 +58,8 @@ void Spider::FuncToRun()
StreamReader *reader = request( currentUrl );
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + currentUrl.Host + ".txt";
size_t docID = hash(currentUrl.CompleteUrl);
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + to_string(docID)+ ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
//parser.parse(reader);
......
......@@ -37,7 +37,7 @@ public:
bool writeDocToDisk(ParsedUrl url);
bool shouldURLbeCrawled( ParsedUrl URL );
size_t hash(const char * s);
int getRobots(ParsedUrl url );
bool checkRobots(ParsedUrl url);
......
This diff is collapsed.
This diff is collapsed.
No preview for this file type
https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards
https://www.nytimes.com/
http://www.dailymail.co.uk/ushome/index.html
http://www.bbc.com/
http://www.bbc.co.uk/news/business-42959138
http://umich.edu
https://en.wikipedia.org/wiki/North_Ronaldsay_sheep
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment