Version upgrade 4.00 #45

Merged
elB4RTO merged 113 commits from devel into main 2024-02-17 16:13:26 +01:00
2 changed files with 222 additions and 378 deletions
Showing only changes of commit 837ef418b4 - Show all commits

View file

@ -7,12 +7,12 @@
#include "utilities/gzip.h"
#include "utilities/io.h"
#include "utilities/strings.h"
#include "utilities/vectors.h"
#include "modules/dialogs.h"
#include "modules/exceptions.h"
#include "modules/craplog/craplog.h"
#include "modules/craplog/modules/datetime.h"
#include "modules/craplog/modules/workers/lib.h"
#include <QSqlDatabase>
#include <QSqlQuery>
@ -20,22 +20,49 @@
//! Builds a parser worker from the given configuration
/*!
    Receives the web server ID, the dialogs level, the paths of the data and
    hashes databases, the logs format, the blacklists, the warnlists, the
    list of log files and the parent QObject.
    The constructor body is empty: the received values are only forwarded
    through the initializer list.
    NOTE(review): two base initializer lists are visible below
    (CraplogParserInterface and QObject); this looks like the removed and the
    added side of a diff shown together -- confirm which one applies.
*/
CraplogParser::CraplogParser( const unsigned web_server_id, const unsigned dialogs_level, const std::string& db_data_path, const std::string& db_hashes_path, const LogsFormat& logs_format, const bw_lists_t& blacklists, const bw_lists_t& warnlists, const worker_files_t& log_files, QObject* parent )
: CraplogParserInterface { web_server_id, dialogs_level, db_data_path, db_hashes_path, logs_format, blacklists, warnlists, log_files, parent }
: QObject { parent }
, wsID { web_server_id }
, dialogs_level { dialogs_level }
, db_data_path { db_data_path }
, db_hashes_path { db_hashes_path }
, blacklists { blacklists }
, warnlists { warnlists }
, logs_format { logs_format }
, files_to_use { log_files }
{
}
void CraplogParser::sendPerfData()
{
emit this->perfData(
this->parsed_size,
this->parsed_lines
);
}
void CraplogParser::sendChartData()
{
emit this->chartData(
this->total_size,
this->total_lines,
this->warnlisted_size,
this->blacklisted_size
);
}
void CraplogParser::work()
{
this->proceed |= true;
this->db_edited &= false;
try {
if ( this->proceed ) {
if ( this->proceed ) [[likely]] {
// collect log lines
this->joinLogLines();
}
if ( this->proceed ) {
if ( this->proceed ) [[likely]] {
// parse the log lines to fill the collection
emit this->startedParsing();
this->parseLogLines();
@ -44,7 +71,7 @@ void CraplogParser::work()
// clear log lines data
this->logs_lines.clear();
if ( this->proceed && this->parsed_size > 0ul ) {
if ( this->proceed && !this->data_collection.empty() ) [[likely]] {
// store the new data
this->storeLogLines();
this->db_edited |= this->proceed;
@ -82,7 +109,7 @@ void CraplogParser::joinLogLines()
std::vector<std::string> aux;
aux.reserve( lines.size() );
for ( const std::string& line : lines ) {
if ( line.front() != '#' ) {
if ( line.front() != '#' ) [[likely]] {
// not a commented line
aux.push_back( line );
}
@ -152,7 +179,7 @@ void CraplogParser::joinLogLines()
).toStdString() );
}
// append to the relative list
// append to the list
this->logs_lines.insert( this->logs_lines.end(), content.begin(), content.end() );
}
if ( this->logs_lines.empty() ) {
@ -163,263 +190,10 @@ void CraplogParser::joinLogLines()
void CraplogParser::parseLogLines()
{
const auto parseLine = [this]( const std::string& line ) {
log_line_data_t data;
std::string_view sep;
std::string fld_str;
bool add_pm{false}, ok{true};
size_t start, stop{this->logs_format.initial.size()},
sep_i{0};
const size_t line_size{ line.size()-1ul },
n_seps{ this->logs_format.separators.size()-1ul };
while (true) {
// split fields
start = stop; // stop updated at the end of the loop
if ( sep_i <= n_seps ) {
sep = this->logs_format.separators.at( sep_i );
stop = line.find( sep, start );
} else if ( sep_i == n_seps+1ul ) {
// final separator
sep = this->logs_format.final;
if ( sep.empty() ) {
stop = line_size+1ul;
} else {
stop = line.find( sep, start );
if ( stop == std::string::npos ) {
stop = line_size+1ul;
}
}
} else {
// no more separators
break;
}
if ( stop == std::string::npos ) {
// separator not found, abort
throw LogParserException( "Separator not found", std::string{sep} );
}
const size_t sep_size = sep.size();
// get the field
const std::string& fld = this->logs_format.fields.at( sep_i );
if ( fld != "NONE" ) {
// only parse the considered fields
fld_str = StringOps::strip( line.substr(start, stop-start), ' ' );
if ( sep_i+1ul <= n_seps ) {
// not the last separator, check for mistakes
ok |= true;
size_t aux_stop = stop;
if ( sep == " " ) {
// whitespace-separated-values fields
size_t c{ static_cast<size_t>( std::count( fld_str.cbegin(), fld_str.cend(), ' ' ) ) },
n{ 0 };
if ( fld == "request_full" ) {
n += 2ul;
} else if ( fld == "date_time_mcs" ) {
n += 4ul;
} else if ( fld == "date_time_ncsa" ) {
n += 1ul;
} else if ( fld == "date_time_gmt" ) {
n += 3ul;
}
if ( n > 0ul && c < n ) {
// loop until the correct number of whitespaces is reached
size_t aux_start = stop+1ul;
while ( c < n ) {
aux_stop = line.find( sep, aux_start );
if ( aux_stop == std::string::npos ) {
// not found
ok &= false;
break;
}
aux_start = aux_stop+1ul;
c++;
}
}
} else if ( sep.front() == '"' && fld == "user_agent" ) {
// atm the only support is for escaped quotes
if ( fld_str.back() == '\\' ) {
size_t aux_start = stop + sep_size;
while (true) {
aux_stop = line.find( sep, aux_start );
if ( aux_stop == std::string::npos ) {
// not found
break;
} else if ( line.at( aux_stop-1ul ) != '\\' ) {
// non-backslashed quotes
break;
}
aux_start = aux_stop + sep_size;
}
}
}
// finally update if needed
if ( ok && aux_stop >= stop ) {
stop = aux_stop;
fld_str = StringOps::strip( line.substr(start, stop-start), ' ' );
}
}
if ( ! fld_str.empty() ) {
// process the field
const int& fld_id{ this->field2id.at(fld) };
if ( fld_id > 0 ) {
// no need to process, append directly if non-empty
if ( fld_id == 13 && fld_str == "-" ) {
continue;
}
data.emplace( fld_id, fld_str );
} else {
// process the field
// process the date to get year, month, day, hour and minute
if ( fld.rfind("date_time",0ul) == 0ul ) {
const auto dt = DateTimeOps::processDateTime( fld_str, fld.substr( 10 ) ); // cut away the "date_time_" part
if ( ! dt.at( 0 ).empty() ) {
// year
data.emplace( this->field2id.at("date_time_year"), dt.at( 0 ) );
}
if ( ! dt.at( 1 ).empty() ) {
// month
data.emplace( this->field2id.at("date_time_month"), dt.at( 1 ) );
}
if ( ! dt.at( 2 ).empty() ) {
// day
data.emplace( this->field2id.at("date_time_day"), dt.at( 2 ) );
}
if ( ! dt.at( 3 ).empty() ) {
// hour
if ( dt.at( 3 ) == "PM" ) {
add_pm |= true;
} else {
data.emplace( this->field2id.at("date_time_hour"), dt.at( 3 ) );
}
}
if ( ! dt.at( 4 ).empty() ) {
// minute
data.emplace( this->field2id.at("date_time_minute"), dt.at( 4 ) );
}
if ( ! dt.at( 5 ).empty() ) {
// second
data.emplace( this->field2id.at("date_time_second"), dt.at( 5 ) );
}
// process the request to get the protocol, method, resource and query
} else if ( fld == "request_full" ) {
size_t aux;
std::string protocol, method, uri, query,
aux_fld{ fld_str };
// method
aux = aux_fld.find( ' ' );
if ( aux != std::string::npos ) {
method = aux_fld.substr( 0ul, aux );
aux_fld = StringOps::lstrip( aux_fld.substr( aux+1ul ) );
// page & query
aux = aux_fld.find( ' ' );
if ( aux != std::string::npos ) {
const std::string aux_str{ aux_fld.substr( 0ul, aux ) };
// search for the query
const size_t aux_{ aux_str.find( '?' ) };
if ( aux_ != std::string::npos ) {
uri = aux_str.substr( 0ul, aux_ );
query = aux_str.substr( aux_+1ul );
} else {
// query not found
uri = aux_str;
}
// protocol
protocol = StringOps::lstrip( aux_fld.substr( aux+1ul ) );
}
}
// append non-empty data
if ( ! protocol.empty() ) {
data.emplace( this->field2id.at("request_protocol"), protocol );
}
if ( ! method.empty() ) {
data.emplace( this->field2id.at("request_method"), method );
}
if ( ! uri.empty() ) {
data.emplace( this->field2id.at("request_uri"), uri );
}
if ( ! query.empty() ) {
data.emplace( this->field2id.at("request_query"), query );
}
// process the request to get uri and query
} else if ( fld == "request_uri_query" ) {
// search for the query
std::string uri, query;
const size_t aux_{ fld_str.find( '?' ) };
if ( aux_ != std::string::npos ) {
uri = fld_str.substr( 0ul, aux_ );
query = fld_str.substr( aux_+1ul );
} else {
// query not found
uri = fld_str;
}
if ( ! uri.empty() ) {
data.emplace( this->field2id.at("request_uri"), uri );
}
if ( ! query.empty() ) {
data.emplace( this->field2id.at("request_query"), query );
}
// process the time taken to convert to milliseconds
} else if ( fld.rfind("time_taken_",0ul) == 0ul ) {
float t{ std::stof( fld_str ) };
const std::string u{ fld.substr( 11ul ) };
if ( u == "us" ) {
// from microseconds
t /= 1000.0f;
} else if ( u == "s" || u == "s.ms" ) {
// from seconds
t *= 1000.0f;
}
data.emplace( this->field2id.at("time_taken"), std::to_string( static_cast<int>( t ) ) );
// something went wrong
} else {
// hmmm.. no...
throw LogParserException( "Unexpected LogField", fld );
}
}
}
}
// update the stop for the next start
stop += sep_size;
sep_i++;
if ( stop > line_size ) {
// this was the final separator
break;
}
}
if ( add_pm ) {
try {
// add +12 hours for PM
data.at( 4 ) = std::to_string( 12 + std::stoi(data.at( 4 )) );
} catch (...) {
// no hour data
}
}
this->data_collection.push_back( data );
// update performance data
this->parsed_size += line_size;
const auto parseLine = [this]( const std::string& line, const LogsFormat& logs_format ) {
this->data_collection.emplace_back( LogLineData(line, logs_format) );
this->parsed_size += line.size();
this->parsed_lines ++;
this->sendPerfData();
};
@ -427,22 +201,39 @@ void CraplogParser::parseLogLines()
if ( this->proceed ) {
const size_t n_lines{ this->logs_lines.size() };
const size_t nl{ this->logs_format.new_lines };
size_t send{ 0ul };
if ( nl == 0ul ) {
const size_t send_gap{ n_lines>1000ul ? n_lines/100 : n_lines>100ul ? n_lines/10 : 10 };
const LogsFormat& lf {this->logs_format};
this->data_collection.reserve( n_lines );
for ( const std::string& line : this->logs_lines ) {
parseLine( line );
parseLine( line, lf );
if (send == send_gap) {
this->sendPerfData();
send = 0ul;
}
++send;
}
} else {
this->data_collection.reserve( n_lines / (nl+1ul) );
const size_t real_lines{ n_lines / (nl+1ul) };
const size_t send_gap{ real_lines>1000ul ? real_lines/100 : real_lines>100ul ? real_lines/10 : 10 };
const LogsFormat& lf {this->logs_format};
this->data_collection.reserve( real_lines );
for ( size_t i{0ul}; i<n_lines; i++ ) {
std::string line = this->logs_lines.at( i );
for ( size_t n{0ul}; n<nl; n++ ) {
i++;
line += "\n" + this->logs_lines.at( i );
}
parseLine( line );
parseLine( line, lf );
if (send == send_gap) {
this->sendPerfData();
send = 0ul;
}
++send;
}
}
this->sendPerfData();
}
}
@ -487,7 +278,7 @@ void CraplogParser::storeLogLines()
{db_name, stmt_msg, err_msg} );
}
if ( this->proceed && !this->data_collection.empty() ) {
if ( this->proceed ) {
this->proceed &= this->storeData( db );
}
@ -507,7 +298,7 @@ void CraplogParser::storeLogLines()
{db_name, stmt_msg, err_msg} );
}
}
if ( ! proceed ) {
if ( ! this->proceed ) {
// rollback
throw (std::exception());
}
@ -544,9 +335,37 @@ void CraplogParser::storeLogLines()
db.close();
}
}
}
//! Appends a numeric field to the SQL statement being built
/*!
    Appends ", " to the `query_stmt` variable in scope at the point of use,
    followed by the value held by LOG_FIELD (without surrounding quotes,
    with every single quote doubled) or by NULL when LOG_FIELD evaluates
    to false.
    The expansion is wrapped in braces, so it acts as one single statement
    even when used as the body of an unbraced if/else/for, and no trailing
    semicolon is required. The argument is parenthesized where dereferenced.
    NOTE(review): the value is inserted without quotes, hence doubling the
    single quotes does not sanitize it by itself; presumably the field is
    validated as numeric while parsing -- confirm.
*/
#define APPEND_TO_QUERY_AS_NUMBER(LOG_FIELD)\
{\
    query_stmt += QStringLiteral(", ");\
    if ( LOG_FIELD ) {\
        query_stmt += QString::fromStdString( *(LOG_FIELD) ).replace("'","''");\
    } else {\
        query_stmt += QStringLiteral("NULL");\
    }\
}
//! Appends a textual field to the SQL statement being built
/*!
    Appends ", " to the `query_stmt` variable in scope at the point of use,
    followed by the value held by LOG_FIELD enclosed in single quotes
    (with every inner single quote doubled) or by NULL when LOG_FIELD
    evaluates to false.
    The expansion is wrapped in braces, so it acts as one single statement
    even when used as the body of an unbraced if/else/for, and no trailing
    semicolon is required. The argument is parenthesized where dereferenced.
*/
#define APPEND_TO_QUERY_AS_STRING(LOG_FIELD)\
{\
    query_stmt += QStringLiteral(", ");\
    if ( LOG_FIELD ) {\
        query_stmt += QString("'%1'").arg( QString::fromStdString( *(LOG_FIELD) ).replace("'","''") );\
    } else {\
        query_stmt += QStringLiteral("NULL");\
    }\
}
//! Appends the user-agent field to the SQL statement being built
/*!
    Appends ", " to the `query_stmt` variable in scope at the point of use,
    followed by the value held by LOG_FIELD enclosed in single quotes
    (with every inner single quote doubled) or by NULL when LOG_FIELD
    evaluates to false.
    In IIS logs the user-agent is logged with '+' instead of ' ' (whitespace):
    when `this->wsID` equals IIS_ID every '+' is turned into a whitespace
    before the quotes get doubled.
    The expansion is wrapped in braces, so it acts as one single statement
    even when used as the body of an unbraced if/else/for, and no trailing
    semicolon is required. The argument is parenthesized where dereferenced.
*/
#define APPEND_TO_QUERY_USER_AGENT(LOG_FIELD)\
{\
    query_stmt += QStringLiteral(", ");\
    if ( LOG_FIELD ) {\
        if ( this->wsID == IIS_ID ) {\
            query_stmt += QString("'%1'").arg( QString::fromStdString( *(LOG_FIELD) ).replace("+"," ").replace("'","''") );\
        } else {\
            query_stmt += QString("'%1'").arg( QString::fromStdString( *(LOG_FIELD) ).replace("'","''") );\
        }\
    } else {\
        query_stmt += QStringLiteral("NULL");\
    }\
}
bool CraplogParser::storeData( QSqlDatabase& db )
{
const QString db_name{ QString::fromStdString(
@ -556,7 +375,7 @@ bool CraplogParser::storeData( QSqlDatabase& db )
// get blacklist/warnlist items
const bool check_bl_cli { this->blacklists.at( 20 ).used };
const bool check_wl_met { this->warnlists.at( 11 ).used };
const bool check_wl_req { this->warnlists.at( 12 ).used };
const bool check_wl_uri { this->warnlists.at( 12 ).used };
const bool check_wl_cli { this->warnlists.at( 20 ).used };
const bool check_wl_ua { this->warnlists.at( 21 ).used };
@ -569,7 +388,7 @@ bool CraplogParser::storeData( QSqlDatabase& db )
? this->warnlists.at( 11 ).list
: empty };
const std::vector<std::string>& wl_req_list{ (check_wl_req)
const std::vector<std::string>& wl_req_list{ (check_wl_uri)
? this->warnlists.at( 12 ).list
: empty };
@ -603,77 +422,47 @@ bool CraplogParser::storeData( QSqlDatabase& db )
bool warning{ false };
QSqlQuery query{ db };
// parse every row of data
for ( const log_line_data_t& row : this->data_collection ) {
for ( const LogLineData& line_data : this->data_collection ) {
// check blacklisted clients
if ( check_bl_cli ) {
if ( row.find( 20 ) != row.end() ) {
// this row does contain this row item, check if they match
const std::string& target{ row.at( 20 ) };
if ( std::any_of( bl_cli_list.cbegin(), bl_cli_list.cend(),
[&target]( const std::string& item )
{ return target.rfind( item, 0ul ) == 0ul; }) ) {
// append every field to ignored size
this->blacklisted_size += std::accumulate( row.cbegin(), row.cend(), 0ul,
[]( size_t size, const auto& item )
{ return size + item.second.size(); });
continue;
}
if ( check_bl_cli && line_data.client ) {
if ( VecOps::contains( bl_cli_list, *line_data.client ) ) {
this->blacklisted_size += line_data.size();
continue;
}
}
// check warnlisted clients
if ( check_wl_cli ) {
if ( row.find( 20 ) != row.end() ) {
// this row do contains this row item, check if they match
const std::string& target{ row.at( 20 ) };
if ( std::any_of( wl_cli_list.cbegin(), wl_cli_list.cend(),
[&target]( const std::string& item )
{ return target.rfind( item, 0ul ) == 0ul; }) ) {
// match found! put a warning on this line
warning |= true;
}
if ( check_wl_cli && line_data.client ) {
if ( VecOps::contains( wl_cli_list, *line_data.client ) ) {
warning |= true;
goto end_of_warnings_check;
}
}
// check warnlisted user-agents
if ( check_wl_ua && !warning ) {
if ( row.find( 21 ) != row.end() ) {
// this row do contains this row item, check if they match
const std::string& target{ row.at( 21 ) };
if ( std::any_of( wl_ua_list.cbegin(), wl_ua_list.cend(),
[&target]( const std::string& item )
{ return target.rfind( item, 0ul ) == 0ul; }) ) {
// match found! skip this line
warning |= true;
}
if ( check_wl_ua && line_data.user_agent ) {
if ( VecOps::contains( wl_ua_list, *line_data.user_agent ) ) {
// match found! put a warning on this line
warning |= true;
goto end_of_warnings_check;
}
}
// check warnlisted methods
if ( check_wl_met && !warning ) {
if ( row.find( 11 ) != row.end() ) {
// this row do contains this row item, check if they match
const std::string& target{ row.at( 11 ) };
if ( std::any_of( wl_met_list.cbegin(), wl_met_list.cend(),
[&target]( const std::string& item )
{ return item == target; }) ) {
// match found! skip this line
warning |= true;
}
if ( check_wl_met && line_data.method ) {
if ( VecOps::contains( wl_met_list, *line_data.method ) ) {
// match found! put a warning on this line
warning |= true;
goto end_of_warnings_check;
}
}
// check warnlisted requests URIs
if ( check_wl_req && !warning ) {
if ( row.find( 12 ) != row.end() ) {
// this row do contains this row item, check if they match
const std::string& target{ row.at( 12 ) };
if ( std::any_of( wl_req_list.cbegin(), wl_req_list.cend(),
[&target]( const std::string& item )
{ return target.rfind( item, 0ul ) == 0ul; }) ) {
// match found! skip this line
warning |= true;
}
if ( check_wl_uri && line_data.uri ) {
if ( VecOps::contains( wl_req_list, *line_data.uri ) ) {
// match found! put a warning on this line
warning |= true;
}
}
end_of_warnings_check:
// initialize the SQL statement
@ -685,66 +474,36 @@ bool CraplogParser::storeData( QSqlDatabase& db )
// warning
if ( warning ) {
warning &= false;
this->warnlisted_size += std::accumulate( row.cbegin(), row.cend(), 0ul,
[]( size_t size, const auto& item )
{ return size + item.second.size(); });
this->warnlisted_size += line_data.size();
query_stmt += "1";
} else {
query_stmt += "0";
}
// date and time
for ( int i{1}; i<7; i++ ) {
query_stmt += ", ";
if ( row.find( i ) == row.cend() ) {
// no value found in the collection, bind NULL
query_stmt += "NULL";
} else {
// value found, bind it
query_stmt += QString::fromStdString( row.at( i ) ).replace("'","''");
}
}
APPEND_TO_QUERY_AS_NUMBER(line_data.year) // 1
APPEND_TO_QUERY_AS_NUMBER(line_data.month) // 2
APPEND_TO_QUERY_AS_NUMBER(line_data.day) // 3
APPEND_TO_QUERY_AS_NUMBER(line_data.hour) // 4
APPEND_TO_QUERY_AS_NUMBER(line_data.minute) // 5
APPEND_TO_QUERY_AS_NUMBER(line_data.second) // 6
// request
for ( int i{10}; i<14; i++ ) {
query_stmt += ", ";
if ( row.find( i ) == row.cend() ) {
// no value found in the collection, bind NULL
query_stmt += "NULL";
} else {
// value found, bind it
query_stmt += QString("'%1'").arg( QString::fromStdString( row.at( i ) ).replace("'","''") );
}
}
APPEND_TO_QUERY_AS_STRING(line_data.protocol) // 10
APPEND_TO_QUERY_AS_STRING(line_data.method) // 11
APPEND_TO_QUERY_AS_STRING(line_data.uri) // 12
APPEND_TO_QUERY_AS_STRING(line_data.query) // 13
for ( int i{14}; i<18; i++ ) {
query_stmt += ", ";
if ( row.find( i ) == row.cend() ) {
// no value found in the collection, bind NULL
query_stmt += "NULL";
} else {
// value found, bind it
query_stmt += QString::fromStdString( row.at( i ) ).replace("'","''");
}
}
APPEND_TO_QUERY_AS_NUMBER(line_data.response_code) // 14
APPEND_TO_QUERY_AS_NUMBER(line_data.time_taken) // 15
APPEND_TO_QUERY_AS_NUMBER(line_data.bytes_sent) // 16
APPEND_TO_QUERY_AS_NUMBER(line_data.bytes_received) // 17
// client data and referrer
for ( const int& i : std::vector<int>{18,20,21,22} ) {
query_stmt += ", ";
if ( row.find( i ) == row.cend() ) {
// no value found in the collection, bind NULL
query_stmt += "NULL";
} else {
// value found, bind it
if ( i == 21 && this->wsID == IIS_ID ) {
// iis logs the user-agent using '+' instead of ' ' (spaces)
QString str = QString::fromStdString( row.at( i ) ).replace("+"," ");
query_stmt += QString("'%1'").arg( str.replace("'","''") );
} else {
query_stmt += QString("'%1'").arg( QString::fromStdString( row.at( i ) ).replace("'","''") );
}
}
}
APPEND_TO_QUERY_AS_STRING(line_data.referrer) // 18
APPEND_TO_QUERY_AS_STRING(line_data.client) // 20
APPEND_TO_QUERY_USER_AGENT(line_data.user_agent) // 21
APPEND_TO_QUERY_AS_STRING(line_data.cookie) // 22
query_stmt += ");";

View file

@ -2,11 +2,29 @@
#define LOGDOCTOR__CRAPLOG__WORKERS__PARSER_H
#include "parser_interface.h"
#include "modules/craplog/modules/lib.h"
#include <QObject>
#include <unordered_map>
class CraplogParser : public CraplogParserInterface
struct BWlist;
struct LogLineData;
enum class WorkerDialog;
class QSqlDatabase;
class CraplogParser : public QObject
{
Q_OBJECT
using logs_file_t = std::tuple<std::string,std::string>;
using worker_files_t = std::vector<logs_file_t>;
using bw_lists_t = std::unordered_map<int, BWlist>;
public:
explicit CraplogParser(
@ -21,17 +39,84 @@ public:
QObject* parent=nullptr
);
signals:
void perfData(
const size_t parsed_size,
const size_t parsed_lines );
void chartData(
const size_t total_size,
const size_t total_lines,
const size_t warnlisted_size,
const size_t blacklisted_size );
void showDialog(
const WorkerDialog dialog_type,
const QStringList arg );
void startedParsing();
void finishedParsing();
void done( const bool successful );
void retire();
public slots:
void work() override;
void work();
virtual void sendPerfData();
virtual void sendChartData();
private:
const unsigned wsID;
const unsigned dialogs_level;
bool proceed{ true };
///////////////////
//// DATABASES ////
bool db_edited{ false };
std::string db_data_path;
std::string db_hashes_path;
//////////////////////
//// PERFORMANCES ////
size_t total_lines { 0ul };
size_t parsed_lines { 0ul };
size_t total_size { 0ul };
size_t parsed_size { 0ul };
size_t warnlisted_size { 0ul };
size_t blacklisted_size { 0ul };
//////////////////////////////
//// BLACKLIST / WARNLIST ////
// { log_field_id : BWlist }
const bw_lists_t blacklists;
const bw_lists_t warnlists;
//////////////
//// LOGS ////
LogsFormat logs_format;
// the selected log files to be parsed during the process
const worker_files_t files_to_use;
// the entire stack of lines which have been read from the log files
std::vector<std::string> logs_lines;
// collection of logs data, each item represents a log line
std::vector<LogLineData> data_collection;
//! Reads the selected files and appends the resulting lines to the list
/*!
\throw GenericException