Skip to content
Snippets Groups Projects
Commit 06bbce9d authored by vcday
Browse files

fixed offsets

parent 76bf3f83
Branches
No related tags found
No related merge requests found
......@@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp)
add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp)
#find_package(OpenSSL REQUIRED)
find_package(OpenSSL REQUIRED)
#target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
......@@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s )
front = temp + strlen(http_buff);
delete[] http_buff;
http_buff = temp;
}
return http_buff;
......
......@@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
{
auto htmlIt = html.begin( );
unsigned long offset = 0;
unsigned long offsetTitle = 0;
unsigned long offsetURL = 0;
// tokenize url
string host = "";
......@@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
path.assign( currentUrl.Path );
string url = host + "/" + path;
tokenizer->execute( url, offset, Tokenizer::URL );
offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );
while ( htmlIt != html.end( ) )
{
......@@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
string title = extract_title( line );
if ( title != "" )
{
tokenizer->execute( title, offset, Tokenizer::TITLE );
offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
}
}
offset = htmlIt - html.begin( );
}
else
{
......
......@@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
* @param offset
* @param decorator
*/
void Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
{
// split by symbols
if ( decorator == Tokenizer::URL )
......@@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '='};
tokenize( splitStr( originalText, split, true ), offset, decorator );
return tokenize( splitStr( originalText, split, true ), offset, decorator );
}
// split by spaces
else
{
tokenize( splitStr( originalText, ' ', true ), offset, decorator );
return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
}
}
......@@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
* @param offset
* @param decorator
*/
void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
{
string processedString = "";
for ( int i = 0; i < splitText.size( ); ++i )
......@@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch
++offset;
}
}
return offset;
}
......@@ -40,7 +40,7 @@ public:
* @param offset
* @param decorator
*/
void execute ( string originalText, unsigned long offset, char decorator = '\0' );
unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
private:
......@@ -54,6 +54,6 @@ private:
* @param offset
* @param decorator
*/
void tokenize ( vector< string > splitText, unsigned long offset, char decorator );
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
};
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment