LogDoctor/logdoctor/modules/craplog/modules/workers/impl/loglinedata.cpp

562 lines
22 KiB
C++

#include "../lib.h"
#include "modules/exceptions.h"
#include "modules/craplog/modules/lib.h"
#include "modules/craplog/modules/datetime.h"
#include "utilities/strings.h"
#include "utilities/vectors.h"
// Builds the data of a single log line by splitting it on the separators of
// the given logs format and storing every considered field.
//
// The scan starts right after the format's initial string (its size is used
// as the first offset), walks through logs_format.separators in order and
// ends with logs_format.final; an empty final separator means that the last
// field extends up to the end of the line.
// The field found before the separator with index i is interpreted as
// logs_format.fields.at( i ).
//
// Throws LogParserException when a separator cannot be found, when a
// whitespace-separated field holds more spaces than expected, when a request
// string holds more than two spaces while the separator itself is a space,
// or when a field has no known parsing rule.
// Exceptions raised by the helpers used here (the .at() lookups, substr,
// std::stof, DateTimeOps::processDateTime) are not caught.
LogLineData::LogLineData(const std::string& line, const LogsFormat& logs_format)
{
    using F = LogsFormatField;

    const size_t line_len{ line.size() },
                 max_seps{ logs_format.separators.size() };
    size_t stop{ logs_format.initial.size() },
           sep_i{ 0ul };

    // one iteration per separator, plus one for the final separator
    while ( sep_i <= max_seps ) {

        // split fields
        const size_t start{ stop }; // stop is updated at the end of the loop

        std::string_view sep;
        if ( sep_i < max_seps ) {
            // regular separator
            sep = logs_format.separators.at( sep_i );
            stop = line.find( sep, start );
            if ( stop == std::string::npos ) {
                throw LogParserException( "Separator not found", std::string{sep} );
            }
        } else {
            // final separator (sep_i == max_seps)
            sep = logs_format.final;
            if ( sep.empty() ) {
                // no final separator: the field ends with the line
                stop = line_len;
            } else {
                stop = line.find( sep, start );
                if ( stop == std::string::npos ) {
                    throw LogParserException( "Final separator not found", std::string{sep} );
                }
            }
        }
        const size_t sep_size{ sep.size() };

        // get the field
        const F fld{ logs_format.fields.at( sep_i ) };

        // only parse the considered fields
        // NOTE(review): this is a bitwise OR, so it can only filter something
        // out if _DISCARDED is 0 -- confirm against the definition of _DISCARDED
        if ( _DISCARDED | fld ) {

            std::string fld_str{ line.substr(start, stop-start) };

            if ( sep_i+1ul < max_seps ) {
                // not the last separator, check for mistakes
                size_t aux_stop = stop;

                if ( (_MAY_HAVE_SPACES & fld) && sep == " " ) {
                    // check the fields with whitespace-separated values
                    const size_t n{ fld == request_full ? 2ul
                                                        : fld & _COUNT_SPACES };
                    size_t c{ StringOps::count( fld_str, ' ' ) };
                    if ( c < n ) {
                        // loop until the correct number of whitespaces is reached
                        size_t aux_start = line[stop+1ul] == ' ' ? stop : stop+1ul;
                        while ( c < n ) {
                            aux_stop = line.find( sep, aux_start );
                            if ( aux_stop == std::string::npos ) {
                                // not found
                                throw LogParserException( "Separator not found", std::string{sep} );
                            }
                            aux_start = aux_stop+1ul;
                            ++c;
                        }
                    } else if ( c > n ) [[unlikely]] {
                        // should be unreachable
                        throw LogParserException( "Unexpected count for separator", std::string{sep} );
                    }

                } else if ( !sep.empty() && sep.front() == '"' && fld == F::user_agent ) {
                    // atm the only support is for escaped quotes
                    // (an empty user-agent has no last character to inspect)
                    if ( !fld_str.empty() && fld_str.back() == '\\' ) {
                        // the found separator is not actually the separator but is part of the user-agent string
                        // keep searching until the real separator is found
                        size_t aux_start = stop + sep_size;
                        while (true) {
                            aux_stop = line.find( sep, aux_start );
                            if ( aux_stop == std::string::npos ) {
                                // not found
                                throw LogParserException( "Separator not found", std::string{sep} );
                            } else if ( line.at( aux_stop-1ul ) != '\\' ) {
                                // non-backslashed quotes, real separator found (hopefully)
                                break;
                            }
                            aux_start = aux_stop + sep_size;
                        }
                    }
                }

                // finally update if needed
                if ( aux_stop > stop ) {
                    stop = aux_stop;
                    fld_str = line.substr(start, stop-start);
                }
            }

            if ( ! fld_str.empty() ) {
                // process the field

                if ( _NO_PARSE_NEEDED & fld ) {
                    // no need to process, store directly
                    // a query consisting of a lone "-" is not stored
                    if ( !(fld == request_query && fld_str == "-") ) {
                        this->data( fld ) = FieldData( std::move(fld_str) );
                    }

                // process the date to get year, month, day, hour, minute and second
                } else if ( _DATE_TIME & fld ) {
                    auto dt = DateTimeOps::processDateTime( fld_str, fld );
                    // only the non-empty components are stored
                    if ( auto& year{ dt.at(0) }; !year.empty() ) {
                        this->year = FieldData( std::move(year) );
                    }
                    if ( auto& month{ dt.at(1) }; !month.empty() ) {
                        this->month = FieldData( std::move(month) );
                    }
                    if ( auto& day{ dt.at(2) }; !day.empty() ) {
                        this->day = FieldData( std::move(day) );
                    }
                    if ( auto& hour{ dt.at(3) }; !hour.empty() ) {
                        this->hour = FieldData( std::move(hour) );
                    }
                    if ( auto& minute{ dt.at(4) }; !minute.empty() ) {
                        this->minute = FieldData( std::move(minute) );
                    }
                    if ( auto& second{ dt.at(5) }; !second.empty() ) {
                        this->second = FieldData( std::move(second) );
                    }

                // process the time taken to convert to milliseconds
                } else if ( _TIME_TAKEN & fld ) {
                    // NOTE(review): std::stof throws on non-numeric text -- confirm callers expect it
                    float t{ std::stof( fld_str ) };
                    if ( fld == time_taken_us ) {
                        // from microseconds
                        t /= 1000.0f;
                    } else if ( fld & time_taken_s ) {
                        // from seconds or seconds.milliseconds
                        t *= 1000.0f;
                    }
                    // the fractional part is truncated
                    this->time_taken = FieldData( std::to_string( static_cast<int>( t ) ) );

                // process the request to get the protocol, method, resource and query
                } else if ( fld == request_full ) {
                    // check whether the request string has the proper number of spaces
                    const size_t n_spaces{ StringOps::count( fld_str, ' ' ) };

                    if ( n_spaces == 0ul ) [[unlikely]] {
                        // no spaces: a single element, find out which one
                        if ( VecOps::contains( this->valid_methods, fld_str ) ) {
                            this->method = FieldData( std::move(fld_str) );
                        } else if ( VecOps::contains( this->valid_protocols, fld_str ) ) {
                            this->protocol = FieldData( std::move(fld_str) );
                        } else {
                            this->storeUriQuery( std::move(fld_str) );
                        }

                    } else if ( n_spaces == 1ul ) [[unlikely]] {
                        // 1 field is missing
                        this->storeMalformedRequestOneSpace( std::move(fld_str) );

                    } else if ( n_spaces > 2ul ) [[unlikely]] {
                        // most likely a malicious attempt
                        if ( sep == " " ) {
                            // hard to say how to handle it properly
                            throw LogParserException( "Malformed request string", fld_str );
                        }
                        this->storeMalformedRequestMultiSpace( std::move(fld_str) );

                    } else [[likely]] {
                        // correct amount of spaces
                        const size_t aux_stop1{ fld_str.find( ' ' ) },
                                     aux_start{ aux_stop1+1ul },
                                     aux_stop2{ fld_str.find( ' ', aux_start ) };

                        if ( aux_stop1 > 0ul && aux_stop2 > aux_start ) [[likely]] {
                            std::string method{ fld_str.substr( 0ul, aux_stop1 ) };
                            std::string protocol{ fld_str.substr( aux_stop2+1ul ) };

                            if ( VecOps::contains( this->valid_methods, method )
                              && VecOps::contains( this->valid_protocols, protocol ) ) [[likely]] {
                                this->method = FieldData( std::move(method) );
                                this->protocol = FieldData( std::move(protocol) );
                                this->storeUriQuery( fld_str.substr( aux_start, aux_stop2-aux_start ) );
                            } else [[unlikely]] {
                                this->storeMalformedRequestMultiSpace( std::move(fld_str) );
                            }
                        } else [[unlikely]] {
                            // empty first or middle element
                            this->storeMalformedRequestMultiSpace( std::move(fld_str) );
                        }
                    }

                // process the request to get uri and query
                } else if ( fld == request_uri_query ) {
                    // search for the query
                    std::string uri, query;
                    const size_t aux_{ fld_str.find( '?' ) };
                    if ( aux_ != std::string::npos ) {
                        uri = fld_str.substr( 0ul, aux_ );
                        query = fld_str.substr( aux_+1ul );
                    } else {
                        // query not found
                        uri = fld_str;
                    }
                    if ( ! uri.empty() ) {
                        this->uri = FieldData( std::move(uri) );
                    }
                    if ( ! query.empty() ) {
                        this->query = FieldData( std::move(query) );
                    }

                // something went wrong
                } else {
                    // hmmm.. no...
                    throw LogParserException( "Unexpected LogFormatField", std::to_string(fld) );
                }
            }
        }

        // update the stop for the next start
        stop += sep_size;
        ++sep_i;
        if ( stop >= line_len ) {
            // the end of the line has been reached
            break;
        }
    }
}
// Stores the given string as the URI, splitting it at the first '?':
// what precedes the mark becomes the URI, what follows becomes the query.
// Without a '?' the whole string is stored as the URI.
// An empty string leaves both fields untouched.
void LogLineData::storeUriQuery(std::string&& str) noexcept
{
    if ( str.empty() ) {
        return;
    }

    const auto query_mark{ str.find( '?' ) };
    if ( query_mark == std::string::npos ) {
        // no query, it's all uri
        this->uri = FieldData( std::move(str) );
        return;
    }

    this->uri = FieldData( str.substr( 0ul, query_mark ) );
    this->query = FieldData( str.substr( query_mark+1ul ) );
}
// Stores the elements of a request string made of two space-separated parts.
// Each part is checked against the valid methods and the valid protocols,
// then the parts are assigned following the table below.
void LogLineData::storeMalformedRequestOneSpace(std::string&& str) noexcept
{
    const size_t space_pos{ str.find( ' ' ) };
    std::string head{ str.substr( 0ul, space_pos ) };
    std::string tail{ str.substr( space_pos+1 ) };

    const bool head_is_method{   VecOps::contains( this->valid_methods,   head ) };
    const bool tail_is_method{   VecOps::contains( this->valid_methods,   tail ) };
    const bool head_is_protocol{ VecOps::contains( this->valid_protocols, head ) };
    const bool tail_is_protocol{ VecOps::contains( this->valid_protocols, tail ) };

    /**
    *** Rows: which parts are a protocol (P), columns: which parts are a method (M),
    *** one bit per part. Cells: U = uri, M = method, P = protocol,
    *** U+ = the whole string is the uri, -- = not expected
    ***
    ***  P\M
    ***        00    01    10    11
    ***
    ***  00    U+    UM    MU    U+
    ***
    ***  01    UP    --    MP    --
    ***
    ***  10    PU    PM    --    --
    ***
    ***  11    U+    --    --    --
    **/

    if ( head_is_method && tail_is_method ) {
        // two methods: everything is the uri
        this->storeUriQuery( StringOps::strip( str ) );
        return;
    }

    if ( head_is_method ) {
        // the method comes first, followed by either the protocol or the uri
        this->method = FieldData( std::move(head) );
        if ( tail_is_protocol ) {
            this->protocol = FieldData( std::move(tail) );
        } else {
            this->storeUriQuery( std::move(tail) );
        }
        return;
    }

    if ( tail_is_method ) {
        // the method comes last, preceded by either the protocol or the uri
        if ( head_is_protocol ) {
            this->protocol = FieldData( std::move(head) );
        } else {
            this->storeUriQuery( std::move(head) );
        }
        this->method = FieldData( std::move(tail) );
        return;
    }

    if ( head_is_protocol && tail_is_protocol ) {
        // two protocols: everything is the uri
        this->storeUriQuery( StringOps::strip( str ) );
        return;
    }

    if ( head_is_protocol ) {
        // protocol first, then the uri
        this->protocol = FieldData( std::move(head) );
        this->storeUriQuery( std::move(tail) );
        return;
    }

    if ( tail_is_protocol ) {
        // uri first, then the protocol
        this->storeUriQuery( std::move(head) );
        this->protocol = FieldData( std::move(tail) );
        return;
    }

    // nothing recognized: everything is the uri
    this->storeUriQuery( StringOps::strip( str ) );
}
// Stores the elements of a request string holding two or more spaces.
// The string is split in three: what precedes the first space, what follows
// the last space and (stripped) what lies in between. Each part is checked
// against the valid methods and the valid protocols, then the parts are
// assigned following the table below.
// The string is expected to hold at least two spaces.
void LogLineData::storeMalformedRequestMultiSpace(std::string&& str) noexcept
{
    const size_t pos1{ str.find( ' ' ) },
                 pos2{ str.rfind( ' ' ) };
    std::string field1{ str.substr( 0ul, pos1 ) };
    std::string field2{ StringOps::strip( str.substr( pos1+1ul, pos2-pos1-1ul ) ) };
    std::string field3{ str.substr( pos2+1ul ) };
    const bool is_method1{ VecOps::contains( this->valid_methods, field1 ) },
               is_method2{ VecOps::contains( this->valid_methods, field2 ) },
               is_method3{ VecOps::contains( this->valid_methods, field3 ) },
               is_protocol1{ VecOps::contains( this->valid_protocols, field1 ) },
               is_protocol2{ VecOps::contains( this->valid_protocols, field2 ) },
               is_protocol3{ VecOps::contains( this->valid_protocols, field3 ) };

    /**
    *** Rows: which fields are a protocol (P), columns: which fields are a method (M),
    *** one bit per field. Cells: U = uri, M = method, P = protocol,
    *** + = the uri also spans over that field, --- = not expected
    ***
    ***  P\M
    ***         000   001   010   011   100   101   110   111
    ***
    ***  000    +U+   +UM   +U+   +U+   MU+   +U+   +U+   +U+
    ***
    ***  001    +UP   ---   UMP   ---   MUP   ---   +UP   ---
    ***
    ***  010    +U+   UPM   ---   ---   MPU   +U+   ---   ---
    ***
    ***  011    +U+   ---   ---   ---   MU+   ---   ---   ---
    ***
    ***  100    PU+   PUM   PMU   PU+   ---   ---   ---   ---
    ***
    ***  101    +U+   ---   +U+   ---   ---   ---   ---   ---
    ***
    ***  110    +U+   +UM   ---   ---   ---   ---   ---   ---
    ***
    ***  111    +U+   ---   ---   ---   ---   ---   ---   ---
    **/

    if ( is_method1 && is_method3 ) {
        // uri = 123
        this->storeUriQuery( StringOps::strip( str ) );

    } else if ( is_method1 && is_method2 ) {
        if ( is_protocol3 ) {
            // uri = 12 // protocol = 3
            this->storeUriQuery( StringOps::strip( str.substr( 0ul, pos2 ) ) );
            this->protocol = FieldData( std::move(field3) );
        } else {
            // uri = 123
            this->storeUriQuery( StringOps::strip( str ) );
        }

    } else if ( is_method2 && is_method3 ) {
        if ( is_protocol1 ) {
            // protocol = 1 // uri = 23
            this->protocol = FieldData( std::move(field1) );
            this->storeUriQuery( StringOps::strip( str.substr( pos1+1ul ) ) );
        } else {
            // uri = 123
            this->storeUriQuery( StringOps::strip( str ) );
        }

    } else if ( is_method1 ) {
        if ( is_protocol2 && !is_protocol3 ) {
            // method = 1 // protocol = 2 // uri = 3
            this->method = FieldData( std::move(field1) );
            this->protocol = FieldData( std::move(field2) );
            this->storeUriQuery( std::move(field3) );
        } else if ( is_protocol3 && !is_protocol2 ) {
            // method = 1 // uri = 2 // protocol = 3
            this->method = FieldData( std::move(field1) );
            this->storeUriQuery( std::move(field2) );
            this->protocol = FieldData( std::move(field3) );
        } else {
            // method = 1 // uri = 23
            this->method = FieldData( std::move(field1) );
            this->storeUriQuery( StringOps::strip( str.substr( pos1+1ul ) ) );
        }

    } else if ( is_method2 ) {
        if ( is_protocol1 && !is_protocol3 ) {
            // protocol = 1 // method = 2 // uri = 3
            this->protocol = FieldData( std::move(field1) );
            this->method = FieldData( std::move(field2) );
            this->storeUriQuery( std::move(field3) );
        } else if ( is_protocol3 && !is_protocol1 ) {
            // uri = 1 // method = 2 // protocol = 3
            this->storeUriQuery( std::move(field1) );
            this->method = FieldData( std::move(field2) );
            this->protocol = FieldData( std::move(field3) );
        } else {
            // uri = 123
            this->storeUriQuery( StringOps::strip( str ) );
        }

    } else if ( is_method3 ) {
        if ( is_protocol1 && !is_protocol2 ) {
            // protocol = 1 // uri = 2 // method = 3
            this->protocol = FieldData( std::move(field1) );
            this->storeUriQuery( std::move(field2) );
            this->method = FieldData( std::move(field3) );
        } else if ( is_protocol2 && !is_protocol1 ) {
            // uri = 1 // protocol = 2 // method = 3
            this->storeUriQuery( std::move(field1) );
            this->protocol = FieldData( std::move(field2) );
            this->method = FieldData( std::move(field3) );
        } else {
            // uri = 12 // method = 3
            this->storeUriQuery( StringOps::strip( str.substr( 0ul, pos2 ) ) );
            this->method = FieldData( std::move(field3) );
        }

    // from here on none of the fields is a method (first column of the table),
    // so only the protocol flags need to be checked
    } else if ( is_protocol1 && !is_protocol2 && !is_protocol3 ) {
        // protocol = 1 // uri = 23
        this->protocol = FieldData( std::move(field1) );
        this->storeUriQuery( StringOps::strip( str.substr( pos1+1ul ) ) );

    } else if ( is_protocol3 && !is_protocol1 && !is_protocol2 ) {
        // uri = 12 // protocol = 3
        this->storeUriQuery( StringOps::strip( str.substr( 0ul, pos2 ) ) );
        this->protocol = FieldData( std::move(field3) );

    } else {
        // no protocol, only the middle one, or more than one protocol
        // uri = 123
        this->storeUriQuery( StringOps::strip( str ) );
    }
}
// Returns the result of chaining operator+ over all the fields of the line.
// NOTE(review): presumably the overall length of the stored data -- confirm
// against the operator+ overloads of FieldData.
size_t LogLineData::size() const noexcept
{
    // the addition is spelled out on purpose: a compound assignment (+=)
    // may resolve to a different overload than the binary operator+

    // date and time
    size_t total = this->year + this->month;
    total = total + this->day;
    total = total + this->hour;
    total = total + this->minute;
    total = total + this->second;
    // request
    total = total + this->protocol;
    total = total + this->method;
    total = total + this->uri;
    total = total + this->query;
    // response
    total = total + this->response_code;
    total = total + this->time_taken;
    total = total + this->bytes_sent;
    total = total + this->bytes_received;
    // client
    total = total + this->referrer;
    total = total + this->client;
    total = total + this->user_agent;
    total = total + this->cookie;
    return total;
}
// Returns a reference to the member holding the data of the given field.
// Throws LogParserException when the field has no dedicated member.
FieldData& LogLineData::data(const LogsFormatField id)
{
    using F = LogsFormatField;

    switch (id) {
        // date and time
        case F::date_time_year:   return this->year;
        case F::date_time_month:  return this->month;
        case F::date_time_day:    return this->day;
        case F::date_time_hour:   return this->hour;
        case F::date_time_minute: return this->minute;
        case F::date_time_second: return this->second;
        // request
        case F::request_protocol: return this->protocol;
        case F::request_method:   return this->method;
        case F::request_uri:      return this->uri;
        case F::request_query:    return this->query;
        // response
        case F::response_code:    return this->response_code;
        case F::time_taken_s:     return this->time_taken;
        case F::bytes_sent:       return this->bytes_sent;
        case F::bytes_received:   return this->bytes_received;
        // client
        case F::referer:          return this->referrer;
        case F::client:           return this->client;
        case F::user_agent:       return this->user_agent;
        case F::cookie:           return this->cookie;
        // anything else has no storage
        default:
            break;
    }

    throw LogParserException( "Unexpected LogFormatField", std::to_string(id) );
}