// Extraction residue (file-viewer metadata, not part of the program): 316 lines, 8.5 KiB, C++
#ifndef FSTREAM
|
|
#define FSTREAM
|
|
#include <fstream>
|
|
#endif
|
|
|
|
#ifndef IOSTREAM
|
|
#define IOSTREAM
|
|
#include <iostream>
|
|
#endif
|
|
|
|
bool reader::file::init(std::string path) {
|
|
if (!file_exists(path)) {
|
|
init_error_message = "File '" + path + "' does not exist";
|
|
return false;
|
|
}
|
|
|
|
file_name = path;
|
|
|
|
std::ifstream file_stream(file_name, std::ifstream::binary);
|
|
|
|
if (!file_stream) {
|
|
init_error_message = "Could not open file '" + file_name + "' for reading";
|
|
return false;
|
|
}
|
|
|
|
// get length of file
|
|
file_stream.seekg(0, file_stream.end);
|
|
file_length = file_stream.tellg();
|
|
|
|
initialized = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
std::string reader::file::read(int64_t start_position, int64_t length) {
|
|
|
|
std::ifstream file_stream(file_name, std::ifstream::binary);
|
|
|
|
// allocate memory
|
|
std::string buffer(length, ' ');
|
|
|
|
// set position
|
|
file_stream.seekg(start_position);
|
|
|
|
// read data as a block
|
|
file_stream.read(&buffer[0], length);
|
|
|
|
file_stream.close();
|
|
|
|
return buffer;
|
|
}
|
|
|
|
bool reader::file::do_read_job() {
|
|
/*
|
|
If just_outputting_positions == true,
|
|
Then we don't need to actually read the file,
|
|
Just output in the format "startposition endposition"
|
|
*/
|
|
if (just_outputting_positions) {
|
|
std::cout << start_position << " " << end_position-1 << std::endl;
|
|
return true;
|
|
}
|
|
|
|
for (int64_t i = start_position; i < end_position; (i = i + block_size)) {
|
|
int64_t amount_left_in_file = (end_position - i);
|
|
|
|
if (block_size > amount_left_in_file) {
|
|
block_size = amount_left_in_file;
|
|
}
|
|
|
|
std::cout << read(i, block_size);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool reader::file::do_normal_search() {
|
|
int64_t match_start = 0;
|
|
int64_t match_end = 0;
|
|
|
|
// Only call the .length() function ONCE and store it
|
|
// To save on resources
|
|
int search_query_length = search_query.length();
|
|
|
|
if (block_size <= search_query_length) {
|
|
// If the block size is smaller than the length of the search query,
|
|
// The user is just being silly.
|
|
// We'll reset it to 1 larger than the length of the search query
|
|
block_size = search_query_length + 1;
|
|
}
|
|
|
|
int shift_by_this_much = (block_size - search_query_length);
|
|
|
|
for (int64_t i = start_position; i < end_position; (i = i + shift_by_this_much)) {
|
|
int64_t amount_left_in_file = (end_position - i);
|
|
|
|
if (block_size > amount_left_in_file || shift_by_this_much > amount_left_in_file) {
|
|
block_size = amount_left_in_file;
|
|
shift_by_this_much = amount_left_in_file;
|
|
}
|
|
|
|
std::string block_data = read(i, block_size);
|
|
|
|
// Check if the WHOLE search query is in the block
|
|
// And if so, just output it
|
|
size_t search_result = block_data.find(search_query);
|
|
|
|
// Next cycle if no match was found
|
|
if (search_result == -1) {
|
|
continue;
|
|
}
|
|
// Start & end position RELATIVE to this block
|
|
start_position = search_result;
|
|
end_position = (start_position + search_query_length);
|
|
|
|
// ABSOLUTE start & end position
|
|
// (that is, relative to the start of the file)
|
|
match_start = (i + start_position);
|
|
match_end = (match_start + search_query_length);
|
|
|
|
if (just_outputting_positions) {
|
|
std::cout << match_start << " " << match_end-1 << std::endl;
|
|
return true;
|
|
}
|
|
|
|
std::cout << block_data.substr(start_position, search_query_length) << std::endl;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool reader::file::do_regex_search() {
|
|
/***
|
|
A std::regex search in Polonius should happen this way:
|
|
0. Validate the regular expression (TODO: expression currently not validated before running)
|
|
1. Parse the regular expression into its component parts
|
|
e.g.:
|
|
Expression: 'abc[a-z]+235'
|
|
Should be split into:
|
|
'a' 'b' 'c'
|
|
'[a-z]+'
|
|
'2' '3' '5'
|
|
And recombined into multiple expressions:
|
|
a. 'abc[a-z]+235'
|
|
b. 'abc[a-z]+23'
|
|
c. 'abc[a-z]+2'
|
|
d. 'abc[a-z]+'
|
|
e. 'abc'
|
|
f. 'ab'
|
|
g. 'a'
|
|
2. Search the loaded block (of size block_size) for a match for the full expression (a)
|
|
If found, skip to the final step
|
|
If not found:
|
|
3. Search: Does the loaded block END WITH a partial match? (Any of the expressions listed above after expression a)
|
|
If no, load block #2 and go back to step #2
|
|
If yes:
|
|
4. Load a new block (of size block_size) STARTING FROM the start position of the aforementioned partial match, and
|
|
restart the process HERE from step #2
|
|
|
|
Final:
|
|
Report the found match
|
|
|
|
One important restriction is that we will be limited to finding std::regex matches no longer than the user-specified block size
|
|
(default 10KB)
|
|
|
|
TODO:
|
|
At the moment, it's also possible to construct situations in which there is a match present, but Polonius will not be able to find it.
|
|
Consider the expression:
|
|
([C-Z]{2})E
|
|
Run on a file with the contents:
|
|
0ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
|
With a block size of 4 bytes
|
|
|
|
The proper match would be:
|
|
CDE
|
|
3 5
|
|
|
|
Polonius scans the first block for a match of the full expression:
|
|
0ABC
|
|
None found, it checks if it ends for a PARTIAL MATCH of the expression's first component ([C-Z]{2}), and finds that it does not
|
|
Polonius then moves on to the next block:
|
|
DEFG
|
|
It finds that this block ends with a partial match ([C-Z]{2})... etc
|
|
But, we've already missed our full match (CDE)
|
|
***/
|
|
int64_t match_start = 0;
|
|
int64_t match_end = 0;
|
|
|
|
std::vector<std::string> sub_expressions = create_sub_expressions(search_query);
|
|
|
|
for (int64_t current_index = start_position; current_index < end_position; (current_index = current_index + block_size)) {
|
|
regex_scan:
|
|
int64_t amount_left_in_file = (end_position - current_index);
|
|
|
|
if (block_size > amount_left_in_file) {
|
|
block_size = amount_left_in_file;
|
|
}
|
|
|
|
std::string block_data = read(current_index, block_size);
|
|
std::smatch regex_search_result;
|
|
std::regex expression(search_query);
|
|
|
|
bool full_match_found = regex_search(block_data, regex_search_result, expression);
|
|
|
|
if (!full_match_found) {
|
|
for (int64_t j = 0; j < sub_expressions.size(); j++) {
|
|
std::smatch sub_expression_search_result;
|
|
std::regex sub_expression(sub_expressions[j] + R"($)"); // 'R"($)"' signifies that the std::string must END with the match
|
|
|
|
// Partial match found?
|
|
bool partial_match_found = regex_search(block_data, sub_expression_search_result, sub_expression);
|
|
int64_t partial_match_position = sub_expression_search_result.prefix().length();
|
|
|
|
if (partial_match_found && partial_match_position > 0) {
|
|
current_index = current_index + partial_match_position;
|
|
goto regex_scan;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
// ABSOLUTE start & end position
|
|
// (that is, relative to the start of the file)
|
|
match_start = current_index + (regex_search_result.prefix().length());
|
|
match_end = current_index + (block_size - regex_search_result.suffix().length());
|
|
|
|
if (just_outputting_positions) {
|
|
std::cout << match_start << " " << match_end-1 << std::endl;
|
|
return true;
|
|
}
|
|
|
|
std::cout << regex_search_result[0] << std::endl;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool reader::file::do_search_job() {
|
|
if (query_type == t_normal_search) {
|
|
return do_normal_search();
|
|
} else if (query_type == t_regex_search) {
|
|
return do_regex_search();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool reader::file::do_job() {
|
|
if (!initialized) {
|
|
std::cout << "Error reading file" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Make sure we're not trying to read outside the bounds of the file
|
|
// And fail open if so
|
|
|
|
// First check the start position
|
|
if (start_position > file_length) {
|
|
// And just read the last byte
|
|
start_position = (file_length - 2);
|
|
}
|
|
|
|
// Now check the end position
|
|
end_position = (start_position + amount_to_read);
|
|
|
|
if (end_position > file_length) {
|
|
// Just set it to read till the end of the file & not further
|
|
amount_to_read = (file_length - start_position);
|
|
end_position = (start_position + amount_to_read);
|
|
}
|
|
|
|
if (job == read_job) {
|
|
return do_read_job();
|
|
}
|
|
|
|
if (job == search_job) {
|
|
return do_search_job();
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
std::string reader::file::get_init_error_message() {
|
|
return init_error_message;
|
|
}
|
|
|
|
int64_t reader::file::get_file_length() {
|
|
return file_length;
|
|
}
|
|
|
|
/**
 * Store the byte offset at which the next job starts.
 *
 * @param position New value for start_position.
 */
void reader::file::set_start_position(int64_t position)
{
    start_position = position;
}
|
|
|
|
/**
 * Store how many bytes the next job covers, counted from start_position.
 *
 * @param amount New value for amount_to_read.
 */
void reader::file::set_amount_to_read(int64_t amount)
{
    amount_to_read = amount;
}
|
|
|
|
/**
 * Choose between printing positions only and printing file content.
 *
 * @param flag New value for just_outputting_positions.
 */
void reader::file::set_just_outputting_positions(bool flag)
{
    just_outputting_positions = flag;
}
|
|
|
|
/**
 * Store the size, in bytes, of the blocks in which the file is processed.
 *
 * @param size New value for block_size.
 */
void reader::file::set_block_size(int size)
{
    block_size = size;
}
|
|
|
|
/**
 * Store the text (or regular expression) that search jobs look for.
 *
 * @param query New value for search_query.
 */
void reader::file::set_search_query(std::string query)
{
    search_query = query;
}
|
|
|
|
/**
 * Select which kind of search do_search_job() runs.
 *
 * @param normal_or_regex New value for query_type.
 */
void reader::file::set_search_type(search_type normal_or_regex)
{
    query_type = normal_or_regex;
}
|
|
|
|
/**
 * Select which job do_job() runs.
 *
 * @param input_job New value for job.
 */
void reader::file::set_job_type(job_type input_job)
{
    job = input_job;
}
|