Skip to content
Snippets Groups Projects
Commit 06bbce9d authored by vcday
Browse files

fixed offsets

parent 76bf3f83
No related branches found
No related tags found
No related merge requests found
...@@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp) ...@@ -55,6 +55,6 @@ add_executable(search-engine search.cpp query/Query.cpp)
add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp) add_executable(ISRWord-tests constraintSolver/tests/ISRWordTests.cpp)
#find_package(OpenSSL REQUIRED) find_package(OpenSSL REQUIRED)
#target_link_libraries(crawler-parser-test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
...@@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s ) ...@@ -48,6 +48,7 @@ char * GetArbitrarySizeBuffer(int s )
front = temp + strlen(http_buff); front = temp + strlen(http_buff);
delete[] http_buff; delete[] http_buff;
http_buff = temp; http_buff = temp;
} }
return http_buff; return http_buff;
......
...@@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -32,7 +32,8 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
{ {
auto htmlIt = html.begin( ); auto htmlIt = html.begin( );
unsigned long offset = 0; unsigned long offsetTitle = 0;
unsigned long offsetURL = 0;
// tokenize url // tokenize url
string host = ""; string host = "";
...@@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -41,7 +42,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
path.assign( currentUrl.Path ); path.assign( currentUrl.Path );
string url = host + "/" + path; string url = host + "/" + path;
tokenizer->execute( url, offset, Tokenizer::URL ); offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );
while ( htmlIt != html.end( ) ) while ( htmlIt != html.end( ) )
{ {
...@@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -72,10 +73,9 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
string title = extract_title( line ); string title = extract_title( line );
if ( title != "" ) if ( title != "" )
{ {
tokenizer->execute( title, offset, Tokenizer::TITLE ); offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
} }
} }
offset = htmlIt - html.begin( );
} }
else else
{ {
......
...@@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const ...@@ -28,7 +28,7 @@ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const
* @param offset * @param offset
* @param decorator * @param decorator
*/ */
void Tokenizer::execute ( string originalText, unsigned long offset, char decorator ) unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator )
{ {
// split by symbols // split by symbols
if ( decorator == Tokenizer::URL ) if ( decorator == Tokenizer::URL )
...@@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora ...@@ -36,12 +36,12 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '='}; '(', ')', '*', '+', ',', ';', '='};
tokenize( splitStr( originalText, split, true ), offset, decorator ); return tokenize( splitStr( originalText, split, true ), offset, decorator );
} }
// split by spaces // split by spaces
else else
{ {
tokenize( splitStr( originalText, ' ', true ), offset, decorator ); return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
} }
} }
...@@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora ...@@ -52,7 +52,7 @@ void Tokenizer::execute ( string originalText, unsigned long offset, char decora
* @param offset * @param offset
* @param decorator * @param decorator
*/ */
void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator ) unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator )
{ {
string processedString = ""; string processedString = "";
for ( int i = 0; i < splitText.size( ); ++i ) for ( int i = 0; i < splitText.size( ); ++i )
...@@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch ...@@ -73,5 +73,6 @@ void Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, ch
++offset; ++offset;
} }
} }
return offset;
} }
...@@ -40,7 +40,7 @@ public: ...@@ -40,7 +40,7 @@ public:
* @param offset * @param offset
* @param decorator * @param decorator
*/ */
void execute ( string originalText, unsigned long offset, char decorator = '\0' ); unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
private: private:
...@@ -54,6 +54,6 @@ private: ...@@ -54,6 +54,6 @@ private:
* @param offset * @param offset
* @param decorator * @param decorator
*/ */
void tokenize ( vector< string > splitText, unsigned long offset, char decorator ); unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
}; };
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment