mirror of
https://github.com/oxen-io/lokinet
synced 2023-12-14 06:53:00 +01:00
f3deabdb96
* add remove_node_from_failing to remove a node by pubkey from the failing set * if a router is deregistered we remove it from the failing set so we don't retest it * remove a router from the failing set if we get a test success
148 lines
6.1 KiB
C++
148 lines
6.1 KiB
C++
#pragma once
|
|
|
|
#include <chrono>
|
|
#include <queue>
|
|
#include <random>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include <llarp/util/time.hpp>
|
|
#include <llarp/router_id.hpp>
|
|
|
|
namespace llarp
|
|
{
|
|
struct AbstractRouter;
|
|
}
|
|
|
|
namespace llarp::consensus
|
|
{
|
|
namespace detail
|
|
{
|
|
using clock_t = std::chrono::steady_clock;
|
|
using time_point_t = std::chrono::time_point<clock_t>;
|
|
|
|
// Returns std::greater on the std::get<N>(v)th element value.
|
|
template <typename T, size_t N>
|
|
struct nth_greater
|
|
{
|
|
constexpr bool
|
|
operator()(const T& lhs, const T& rhs) const
|
|
{
|
|
return std::greater<std::tuple_element_t<N, T>>{}(std::get<N>(lhs), std::get<N>(rhs));
|
|
}
|
|
};
|
|
|
|
struct incoming_test_state
|
|
{
|
|
time_point_t last_test{};
|
|
time_point_t last_whine{};
|
|
bool was_failing = false;
|
|
};
|
|
|
|
} // namespace detail
|
|
using time_point_t = detail::time_point_t;
|
|
using clock_t = detail::clock_t;
|
|
|
|
// How often we tick the timer to check whether we need to do any tests.
|
|
constexpr auto REACHABILITY_TESTING_TIMER_INTERVAL = 50ms;
|
|
|
|
class reachability_testing
|
|
{
|
|
public:
|
|
// Distribution for the seconds between node tests: we throw in some randomness to avoid
|
|
// potential clustering of tests. (Note that there is some granularity here as the test timer
|
|
// only runs every REACHABILITY_TESTING_TIMER_INTERVAL).
|
|
std::normal_distribution<float> TESTING_INTERVAL{10.0, 3.0};
|
|
|
|
// The linear backoff after each consecutive test failure before we re-test. Specifically we
|
|
// schedule the next re-test for (TESTING_BACKOFF*previous_failures) + TESTING_INTERVAL(rng).
|
|
inline static constexpr auto TESTING_BACKOFF = 10s;
|
|
|
|
// The upper bound for the re-test interval.
|
|
inline static constexpr auto TESTING_BACKOFF_MAX = 2min;
|
|
|
|
// The maximum number of nodes that we will re-test at once (i.e. per TESTING_TIMING_INTERVAL);
|
|
// mainly intended to throttle ourselves if, for instance, our own connectivity loss makes us
|
|
// accumulate tons of nodes to test all at once. (Despite the random intervals, this can happen
|
|
// if we also get decommissioned during which we can't test at all but still have lots of
|
|
// failing nodes we want to test right away when we get recommissioned).
|
|
inline static constexpr int MAX_RETESTS_PER_TICK = 4;
|
|
|
|
// Maximum time without a ping before we start whining about it.
|
|
//
|
|
// We have a probability of about 0.368* of *not* getting pinged within a ping interval (10s),
|
|
// and so the probability of not getting a ping for 2 minutes (i.e. 12 test spans) just because
|
|
// we haven't been selected is extremely small (0.0000061). It also coincides nicely with
|
|
// blockchain time (i.e. two minutes) and our max testing backoff.
|
|
//
|
|
// * = approx value of ((n-1)/n)^n for non-tiny values of n
|
|
inline static constexpr auto MAX_TIME_WITHOUT_PING = 2min;
|
|
|
|
// How often we whine in the logs about being unreachable
|
|
inline static constexpr auto WHINING_INTERVAL = 2min;
|
|
|
|
private:
|
|
// Queue of pubkeys of service nodes to test; we pop off the back of this until the queue
|
|
// empties then we refill it with a shuffled list of all pubkeys then pull off of it until it is
|
|
// empty again, etc.
|
|
std::vector<RouterID> testing_queue;
|
|
|
|
// The next time for a general test
|
|
time_point_t next_general_test = time_point_t::min();
|
|
|
|
// When we started, so that we know not to hold off on whining about no pings for a while.
|
|
const time_point_t startup = clock_t::now();
|
|
|
|
// Pubkeys, next test times, and sequential failure counts of service nodes that are currently
|
|
// in "failed" status along with the last time they failed; we retest them first after 10s then
|
|
// back off linearly by an additional 10s up to a max testing interval of 2m30s, until we get a
|
|
// successful response.
|
|
using FailingPK = std::tuple<RouterID, time_point_t, int>;
|
|
std::priority_queue<FailingPK, std::vector<FailingPK>, detail::nth_greater<FailingPK, 1>>
|
|
failing_queue;
|
|
std::unordered_set<RouterID> failing;
|
|
|
|
// Track the last time *this node* was tested by other network nodes; used to detect and warn
|
|
// about possible network issues.
|
|
detail::incoming_test_state last;
|
|
|
|
public:
|
|
// If it is time to perform another random test, this returns the next node to test from the
|
|
// testing queue and returns it, also updating the timer for the next test. If it is not yet
|
|
// time, or if the queue is empty and cannot current be replenished, returns std::nullopt. If
|
|
// the queue empties then this builds a new one by shuffling current public keys in the swarm's
|
|
// "all nodes" then starts using the new queue for this an subsequent calls.
|
|
//
|
|
// `requeue` is mainly for internal use: if false it avoids rebuilding the queue if we run
|
|
// out (and instead just return nullopt).
|
|
std::optional<RouterID>
|
|
next_random(
|
|
AbstractRouter* router, const time_point_t& now = clock_t::now(), bool requeue = true);
|
|
|
|
// Removes and returns up to MAX_RETESTS_PER_TICK nodes that are due to be tested (i.e.
|
|
// next-testing-time <= now). Returns [snrecord, #previous-failures] for each.
|
|
std::vector<std::pair<RouterID, int>>
|
|
get_failing(const time_point_t& now = clock_t::now());
|
|
|
|
// Adds a bad node pubkey to the failing list, to be re-tested soon (with a backoff depending on
|
|
// `failures`; see TESTING_BACKOFF). `previous_failures` should be the number of previous
|
|
// failures *before* this one, i.e. 0 for a random general test; or the failure count returned
|
|
// by `get_failing` for repeated failures.
|
|
void
|
|
add_failing_node(const RouterID& pk, int previous_failures = 0);
|
|
|
|
/// removes the public key from the failing set
|
|
void
|
|
remove_node_from_failing(const RouterID& pk);
|
|
|
|
// Called when this router receives an incomming session
|
|
void
|
|
incoming_ping(const time_point_t& now = clock_t::now());
|
|
|
|
// Check whether we received incoming pings recently
|
|
void
|
|
check_incoming_tests(const time_point_t& now = clock_t::now());
|
|
};
|
|
|
|
} // namespace llarp::consensus
|