Re-do how SS ping tests are handled

This moves all the responsibility of ping testing (deciding when it's unreachable, etc.) into oxend, allowing for better reporting on SS ping results and eliminating some edge cases that can lead to oxend and storage server getting "stuck" thinking each is in a different state.
2021-04-13 18:00:00 -03:00 · 2021-04-13 18:00:00 -03:00 · bdebfda9f8
parent 3b4c8f4a5d
commit bdebfda9f8
8 changed files with 140 additions and 25 deletions
--- a/src/cryptonote_config.h
+++ b/src/cryptonote_config.h
@ -241,6 +241,7 @@ namespace config
  inline constexpr auto UPTIME_PROOF_CHECK_INTERVAL = 30s; // How frequently to check whether we need to broadcast a proof
  inline constexpr auto UPTIME_PROOF_FREQUENCY = 1h; // How often to send proofs out to the network since the last proof we successfully sent.  (Approximately; this can be up to CHECK_INTERFACE/2 off in either direction).  The minimum accepted time between proofs is half of this.
  inline constexpr auto UPTIME_PROOF_VALIDITY = 2h + 5min; // The maximum time that we consider an uptime proof to be valid (i.e. after this time since the last proof we consider the SN to be down)
+  inline constexpr auto SS_MAX_FAILURE_VALIDITY = 10min; // If we don't hear any SS ping test failures for more than this long then we start considering the SN as passing for the purpose of obligation testing until we get another test result.  This should be somewhat larger than SS's max re-test backoff (5min).

  // Hash domain separators
  inline constexpr std::string_view HASH_KEY_BULLETPROOF_EXPONENT = "bulletproof"sv;
--- a/src/cryptonote_core/service_node_list.cpp
+++ b/src/cryptonote_core/service_node_list.cpp
@ -3233,23 +3233,48 @@ namespace service_nodes
    info.timesync_status.add(entry);
  }

-  bool service_node_list::set_storage_server_peer_reachable(crypto::public_key const &pubkey, bool value)
+  std::optional<bool> proof_info::ss_reachable(const std::chrono::steady_clock::time_point& now) const {
+    if (ss_last_reachable >= ss_last_unreachable) // Latest report was a pass (also taken when both are still NEVER, i.e. untested)
+      return true;
+    if (ss_last_unreachable > now - config::SS_MAX_FAILURE_VALIDITY) // Latest report was a failure and it is still fresh
+      return false;
+    // The last result was a failure, but it is at least SS_MAX_FAILURE_VALIDITY old, so we don't
+    // know for sure that it isn't reachable now:
+    return std::nullopt;
+  }
+
+  bool proof_info::ss_unreachable_for(std::chrono::seconds threshold, const std::chrono::steady_clock::time_point& now) const {
+    if (auto maybe_reachable = ss_reachable(now); !maybe_reachable /*stale*/ || *maybe_reachable /*good*/)
+      return false; // No fresh failure on record, so this is not treated as unreachable
+    if (ss_first_unreachable > now - threshold)
+      return false; // Unreachable, but the current run of failures began less than `threshold` ago
+    return true; // Fresh failure, and no pass has been reported for at least `threshold`
+  }
+
+  bool service_node_list::set_storage_server_peer_reachable(crypto::public_key const &pubkey, bool reachable)
  {
+    // (See .h for overview description)
+
    std::lock_guard lock(m_sn_mutex);

    if (!m_state.service_nodes_infos.count(pubkey)) {
-      LOG_PRINT_L2("No Service Node is known by this pubkey: " << pubkey);
+      MDEBUG("Dropping SS reachable report: " << pubkey << " is not a registered SN pubkey");
      return false;
    }

-    proof_info &info = proofs[pubkey];
-    if (info.storage_server_reachable != value)
-    {
-      info.storage_server_reachable = value;
-      LOG_PRINT_L2("Setting reachability status for node " << pubkey << " as: " << (value ? "true" : "false"));
+    MDEBUG("Received " << (reachable ? "reachable" : "UNREACHABLE") << " report for SN " << pubkey);
+
+    const auto now = std::chrono::steady_clock::now();
+    proof_info& info = proofs[pubkey];
+    if (reachable) {
+      info.ss_last_reachable = now;
+      info.ss_first_unreachable = NEVER;
+    } else {
+      info.ss_last_unreachable = now;
+      if (info.ss_first_unreachable == NEVER)
+        info.ss_first_unreachable = now;
    }

-    info.storage_server_reachable_timestamp = time(nullptr);
    return true;
  }

--- a/src/cryptonote_core/service_node_list.h
+++ b/src/cryptonote_core/service_node_list.h
@ -28,6 +28,7 @@

 #pragma once

+#include <chrono>
 #include <mutex>
 #include <shared_mutex>
 #include <string_view>
@ -139,6 +140,8 @@ namespace service_nodes
    ValueType const *end()   const { return array.data() + std::min(array.size(), write_index); }
  };

+  inline constexpr auto NEVER = std::chrono::steady_clock::time_point::min();
+
  struct proof_info
  {
    proof_info();
@ -152,8 +155,20 @@ namespace service_nodes
    uint64_t effective_timestamp = 0; // Typically the same, but on recommissions it is set to the recommission block time to fend off instant obligation checks
    std::array<std::pair<uint32_t, uint64_t>, 2> public_ips = {}; // (not serialized)

-    bool storage_server_reachable               = true;
-    uint64_t storage_server_reachable_timestamp = 0;
+    // See set_storage_server_peer_reachable(...)
+    std::chrono::steady_clock::time_point ss_last_reachable = NEVER;
+    std::chrono::steady_clock::time_point ss_first_unreachable = NEVER;
+    std::chrono::steady_clock::time_point ss_last_unreachable = NEVER;
+    // Returns whether or not this SS is currently (probably) reachable:
+    // - true if the last test was a pass (regardless of how long ago)
+    // - false if the last test was a recent fail (i.e. less than SS_MAX_FAILURE_VALIDITY ago)
+    // - nullopt if the last test was a failure, but is considered stale.
+    // Both true and nullopt are considered a pass for service node testing.
+    std::optional<bool> ss_reachable(const std::chrono::steady_clock::time_point& now = std::chrono::steady_clock::now()) const;
+    // Returns true if this node's SS has recently failed reachability (see above) *and* has been
+    // unreachable for at least the given threshold (that is: there is both a recent failure and a
+    // failure more than `threshold` ago, with no intervening reachability pass reports).
+    bool ss_unreachable_for(std::chrono::seconds threshold, const std::chrono::steady_clock::time_point& now = std::chrono::steady_clock::now()) const;

    // Unlike all of the above (except for timestamp), these values *do* get serialized
    std::unique_ptr<uptime_proof::Proof> proof;
@ -545,6 +560,41 @@ namespace service_nodes
    // Called every hour to remove proofs for expired SNs from memory and the database.
    void cleanup_proofs();

+    // Called via RPC from storage server to report a ping test result for a remote storage server.
+    //
+    // How this works (as of SS 2.0.9/oxen 9.x):
+    // - SS randomly picks probably-good nodes to test every 10s (with fuzz), and pings
+    //   known-failing nodes to re-test them.
+    // - SS re-tests nodes with a linear backoff: 10s+fuzz after the first failure, then 20s+fuzz,
+    //   then 30s+fuzz, etc. (up to ~5min retest intervals)
+    // - Whenever SS gets *any* ping result at all it notifies us via RPC (which lands here), and it
+    //   is (as of 9.x) our responsibility to decide when too many bad pings should be penalized.
+    //
+    // Our rules are as follows:
+    // - if we have received only failures for more than 1h5min *and* we have at least one failure
+    //   in the last 10min then we consider SS reachability to be failing.
+    // - otherwise we consider it good.  (Which means either it passed a reachability test at least
+    //   once in the last 1h5min *or* SS stopped pinging it, perhaps because it restarted).
+    //
+    // We do all this by tracking three values:
+    // - ss_last_reachable
+    // - ss_first_unreachable
+    // - ss_last_unreachable
+    //
+    // On a good ping, we set last_reachable to the current time and clear first_unreachable.  On a
+    // bad ping we set last_unreachable to the current time and, if first_unreachable is empty, set
+    // it to current time as well.
+    //
+    // This then lets us figure out:
+    // - current status can be good (first_unreachable == NEVER), failing (last_unreachable is less
+    //   than 10min old), or passable (last_unreachable is more than 10min old, i.e. the failure is
+    //   stale).
+    // - current *failing* status (current status == failing && first_unreachable more than 1h5min ago)
+    // - last test time (max(last_reachable, last_unreachable), "not yet" if this is NEVER)
+    // - last test result (last_reachable >= last_unreachable)
+    // - how long it has been unreachable (now - first_unreachable, if first_unreachable is set)
+    //
+    // (Also note that the actual times referenced here are for convenience: 10min is actually
+    // SS_MAX_FAILURE_VALIDITY, and 1h5min is actually UPTIME_PROOF_VALIDITY-UPTIME_PROOF_FREQUENCY
+    // (which is actually 11min on testnet rather than 1h5min)).
    bool set_storage_server_peer_reachable(crypto::public_key const &pubkey, bool value);

    struct quorum_for_serialization
--- a/src/cryptonote_core/service_node_quorum_cop.cpp
+++ b/src/cryptonote_core/service_node_quorum_cop.cpp
@ -100,7 +100,7 @@ namespace service_nodes
    bool check_timestamp_obligation = false;

    m_core.get_service_node_list().access_proof(pubkey, [&](const proof_info &proof) {
-      ss_reachable             = proof.storage_server_reachable;
+      ss_reachable             = !proof.ss_unreachable_for(netconf.UPTIME_PROOF_VALIDITY - netconf.UPTIME_PROOF_FREQUENCY);
      timestamp                = std::max(proof.proof->timestamp, proof.effective_timestamp);
      ips                      = proof.public_ips;
      checkpoint_participation = proof.checkpoint_participation;
--- a/src/daemon/rpc_command_executor.cpp
+++ b/src/daemon/rpc_command_executor.cpp
@ -1684,12 +1684,27 @@ static void append_printable_service_node_list_entry(cryptonote::network_type ne
    //
    // NOTE: Storage Server Test
    //
-    stream << indent2 << "Storage Server Reachable: " << (entry.storage_server_reachable ? "Yes" : "No") << " (";
-    if (entry.storage_server_reachable_timestamp == 0)
-      stream << "Awaiting first test";
-    else
-      stream << "Last checked: " << get_human_time_ago(entry.storage_server_reachable_timestamp, now);
-    stream << ")\n";
+    stream << indent2 << "Storage Server Reachable: ";
+    if (entry.storage_server_first_unreachable == 0) {
+      if (entry.storage_server_last_reachable == 0)
+        stream << "Not yet tested";
+      else {
+        stream << "Yes (last tested " << get_human_time_ago(entry.storage_server_last_reachable, now);
+        if (entry.storage_server_last_unreachable)
+          stream << "; last failure " << get_human_time_ago(entry.storage_server_last_unreachable, now);
+        stream << ")";
+      }
+    } else {
+      stream << "NO";
+      if (!entry.storage_server_reachable)
+        stream << " - FAILING!";
+      stream << " (last tested " << get_human_time_ago(entry.storage_server_last_unreachable, now)
+        << "; failing since " << get_human_time_ago(entry.storage_server_first_unreachable, now);
+      if (entry.storage_server_last_reachable)
+        stream << "; last good " << get_human_time_ago(entry.storage_server_last_reachable, now);
+      stream << ")";
+    }
+    stream << "\n";

    //
    // NOTE: Component Versions
--- a/src/rpc/core_rpc_server.cpp
+++ b/src/rpc/core_rpc_server.cpp
@ -3028,6 +3028,16 @@ namespace cryptonote { namespace rpc {
    res.status = STATUS_OK;
    return res;
  }
+
+  // Converts a steady_clock reachability timestamp into a system_clock-based time_t for the RPC
+  // response.  The value is translated by applying its offset from `steady_now` to `system_now`,
+  // so the caller should sample both "now" values together.  Returns 0 when `t` is
+  // service_nodes::NEVER (i.e. no such test result has been recorded).
+  static time_t reachable_to_time_t(
+      std::chrono::steady_clock::time_point t,
+      std::chrono::system_clock::time_point system_now,
+      std::chrono::steady_clock::time_point steady_now) {
+    if (t == service_nodes::NEVER)
+      return 0;
+    // steady_clock and system_clock are not required to share a tick period (e.g. libc++ uses
+    // nanoseconds and microseconds respectively).  When they differ, the sum below has a finer
+    // resolution than system_clock::time_point and will not implicitly convert to it, so cast
+    // explicitly before handing it to to_time_t().
+    return std::chrono::system_clock::to_time_t(
+        std::chrono::time_point_cast<std::chrono::system_clock::duration>(system_now + (t - steady_now)));
+  }
+
  //------------------------------------------------------------------------------------------------------------------------------
  void core_rpc_server::fill_sn_response_entry(GET_SERVICE_NODES::response::entry& entry, const service_nodes::service_node_pubkey_info &sn_info, uint64_t current_height) {

@ -3046,22 +3056,26 @@ namespace cryptonote { namespace rpc {
    entry.last_decommission_reason_consensus_all      = info.last_decommission_reason_consensus_all;
    entry.last_decommission_reason_consensus_any      = info.last_decommission_reason_consensus_any;

-    m_core.get_service_node_list().access_proof(sn_info.pubkey, [&entry](const auto &proof) {
+    auto& netconf = m_core.get_net_config();
+    m_core.get_service_node_list().access_proof(sn_info.pubkey, [&entry, &netconf](const auto &proof) {
        entry.service_node_version     = proof.proof->version;
        entry.lokinet_version          = proof.proof->lokinet_version;
        entry.storage_server_version   = proof.proof->storage_server_version;
        entry.public_ip                = epee::string_tools::get_ip_string_from_int32(proof.proof->public_ip);
        entry.storage_port             = proof.proof->storage_https_port;
        entry.storage_lmq_port         = proof.proof->storage_omq_port;
-        entry.storage_server_reachable = proof.storage_server_reachable;
        entry.pubkey_ed25519           = proof.proof->pubkey_ed25519 ? tools::type_to_hex(proof.proof->pubkey_ed25519) : "";
        entry.pubkey_x25519            = proof.pubkey_x25519 ? tools::type_to_hex(proof.pubkey_x25519) : "";
        entry.quorumnet_port           = proof.proof->qnet_port;

        // NOTE: Service Node Testing
        entry.last_uptime_proof                  = proof.proof->timestamp;
-        entry.storage_server_reachable           = proof.storage_server_reachable;
-        entry.storage_server_reachable_timestamp = proof.storage_server_reachable_timestamp;
+        auto system_now = std::chrono::system_clock::now();
+        auto steady_now = std::chrono::steady_clock::now();
+        entry.storage_server_reachable = !proof.ss_unreachable_for(netconf.UPTIME_PROOF_VALIDITY - netconf.UPTIME_PROOF_FREQUENCY, steady_now);
+        entry.storage_server_first_unreachable = reachable_to_time_t(proof.ss_first_unreachable, system_now, steady_now);
+        entry.storage_server_last_unreachable = reachable_to_time_t(proof.ss_last_unreachable, system_now, steady_now);
+        entry.storage_server_last_reachable = reachable_to_time_t(proof.ss_last_reachable, system_now, steady_now);

        service_nodes::participation_history<service_nodes::participation_entry> const &checkpoint_participation = proof.checkpoint_participation;
        service_nodes::participation_history<service_nodes::participation_entry> const &pulse_participation      = proof.pulse_participation;
--- a/src/rpc/core_rpc_server_commands_defs.cpp
+++ b/src/rpc/core_rpc_server_commands_defs.cpp
@ -1116,7 +1116,9 @@ KV_SERIALIZE_MAP_CODE_BEGIN(GET_SERVICE_NODES::requested_fields_t)

    KV_SERIALIZE(last_uptime_proof)
    KV_SERIALIZE(storage_server_reachable)
-    KV_SERIALIZE(storage_server_reachable_timestamp)
+    KV_SERIALIZE(storage_server_first_unreachable)
+    KV_SERIALIZE(storage_server_last_unreachable)
+    KV_SERIALIZE(storage_server_last_reachable)
    KV_SERIALIZE(checkpoint_participation)
    KV_SERIALIZE(pulse_participation)
  }
@ -1166,7 +1168,9 @@ KV_SERIALIZE_MAP_CODE_BEGIN(GET_SERVICE_NODES::response::entry)
  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(pubkey_x25519);
  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(last_uptime_proof);
  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(storage_server_reachable);
-  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(storage_server_reachable_timestamp);
+  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(storage_server_first_unreachable)
+  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(storage_server_last_unreachable)
+  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(storage_server_last_reachable)
  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(checkpoint_participation);
  KV_SERIALIZE_ENTRY_FIELD_IF_REQUESTED(pulse_participation);
 KV_SERIALIZE_MAP_CODE_END()
--- a/src/rpc/core_rpc_server_commands_defs.h
+++ b/src/rpc/core_rpc_server_commands_defs.h
@ -2052,6 +2052,9 @@ namespace rpc {
      bool last_uptime_proof;
      bool storage_server_reachable;
      bool storage_server_reachable_timestamp;
+      bool storage_server_last_reachable;
+      bool storage_server_last_unreachable;
+      bool storage_server_first_unreachable;
      bool checkpoint_participation;
      bool pulse_participation;

@ -2111,8 +2114,11 @@ namespace rpc {

        // Service Node Testing
        uint64_t                                last_uptime_proof;                   // The last time this Service Node's uptime proof was relayed by at least 1 Service Node other than itself in unix epoch time.
-        bool                                    storage_server_reachable;            // Whether the node's storage server has been reported as unreachable for a long time
-        uint64_t                                storage_server_reachable_timestamp;  // The last time this Service Node's storage server was contacted
+        bool                                    storage_server_reachable;            // True if this storage server is currently passing tests for the purposes of SN node testing: true if the last test passed, or if it has been unreachable for less than an hour; false if it has been failing tests for more than an hour (and thus is considered unreachable).
+        uint64_t                                storage_server_first_unreachable;    // If the last test we received was a failure, this field contains the timestamp when failures started.  Will be 0 if the last result was a success or the node has not yet been tested.  (To distinguish between these cases check storage_server_last_reachable).
+        uint64_t                                storage_server_last_unreachable;     // The last time this Service Node failed a ping test (regardless of whether or not it is currently failing); 0 if it never failed a test since startup.
+        uint64_t                                storage_server_last_reachable;       // The last time we received a successful ping response for this Service Node (whether or not it is currently failing); 0 if we have never received a success since startup.
+
        std::vector<service_nodes::participation_entry> checkpoint_participation;    // Of the last N checkpoints the Service Node is in a checkpointing quorum, record whether or not the Service Node voted to checkpoint a block
        std::vector<service_nodes::participation_entry> pulse_participation;         // Of the last N pulse blocks the Service Node is in a pulse quorum, record whether or not the Service Node voted (participated) in that block
        std::vector<service_nodes::timestamp_participation_entry> timestamp_participation;         // Of the last N timestamp messages, record whether or not the Service Node was in sync with the network