From babf73612f592e655b9fc84d971c543bc02ccc28 Mon Sep 17 00:00:00 2001 From: Matthias Seitz Date: Thu, 5 Jan 2023 13:09:38 +0100 Subject: [PATCH] feat(net): add granular backoff durations (#729) * feat(net): add granular backoff durations * update backoff durations --- crates/net/network/src/error.rs | 99 +++++++++++++++++++------ crates/net/network/src/peers/manager.rs | 85 ++++++++++++++++----- 2 files changed, 142 insertions(+), 42 deletions(-) diff --git a/crates/net/network/src/error.rs b/crates/net/network/src/error.rs index 0eee7a27a..61a177cd3 100644 --- a/crates/net/network/src/error.rs +++ b/crates/net/network/src/error.rs @@ -5,7 +5,7 @@ use reth_eth_wire::{ errors::{EthHandshakeError, EthStreamError, P2PHandshakeError, P2PStreamError}, DisconnectReason, }; -use std::fmt; +use std::{fmt, io::ErrorKind}; /// All error variants for the network #[derive(Debug, thiserror::Error)] @@ -34,9 +34,33 @@ pub(crate) trait SessionError: fmt::Debug { /// of the gossip network. fn is_fatal_protocol_error(&self) -> bool; - /// Returns true if the error should lead to backoff, temporarily preventing additional - /// connection attempts - fn should_backoff(&self) -> bool; + /// Whether we should backoff. + /// + /// Returns the severity of the backoff that should be applied, or `None`, if no backoff should + /// be applied. + /// + /// In case of `Some(BackoffKind)` will temporarily prevent additional + /// connection attempts. + fn should_backoff(&self) -> Option; +} + +/// Describes the type of backoff should be applied. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BackoffKind { + /// Use the lowest configured backoff duration. + /// + /// This applies to connection problems where there is a chance that they will be resolved + /// after the short duration. + Low, + /// Use a slightly higher duration to put a peer in timeout + /// + /// This applies to more severe connection problems where there is a lower chance that they + /// will be resolved. + Medium, + /// Use the max configured backoff duration. + /// + /// This is intended for spammers, or bad peers in general. + High, } impl SessionError for EthStreamError { @@ -88,22 +112,51 @@ impl SessionError for EthStreamError { } } - fn should_backoff(&self) -> bool { - matches!( - self, + fn should_backoff(&self) -> Option { + if let Some(err) = self.as_io() { + return match err.kind() { + // these usually happen when the remote instantly drops the connection, for example + // if the previous connection isn't properly cleaned up yet and the peer is temp. + // banned. + ErrorKind::ConnectionRefused | + ErrorKind::ConnectionReset | + ErrorKind::BrokenPipe => Some(BackoffKind::Low), + _ => Some(BackoffKind::Medium), + } + } + + if let Some(err) = self.as_disconnected() { + return match err { + DisconnectReason::TooManyPeers | + DisconnectReason::AlreadyConnected | + DisconnectReason::TcpSubsystemError => Some(BackoffKind::Low), + _ => { + // These are considered fatal, and are handled by the + // [`SessionError::is_fatal_protocol_error`] + Some(BackoffKind::High) + } + } + } + + // This only checks for a subset of error variants, the counterpart of + // [`SessionError::is_fatal_protocol_error`] + match self { + // timeouts EthStreamError::EthHandshakeError(EthHandshakeError::NoResponse) | - EthStreamError::P2PStreamError(P2PStreamError::HandshakeError( - P2PHandshakeError::NoResponse - )) - ) || self.as_io().is_some() || - self.as_disconnected() - .map(|reason| { - matches!( - reason, - DisconnectReason::TooManyPeers | DisconnectReason::AlreadyConnected - ) - }) - .unwrap_or_default() + EthStreamError::P2PStreamError(P2PStreamError::HandshakeError( + P2PHandshakeError::NoResponse, + )) | + EthStreamError::P2PStreamError(P2PStreamError::PingTimeout) => Some(BackoffKind::Low), + // malformed messages + EthStreamError::P2PStreamError(P2PStreamError::Rlp(_)) | + EthStreamError::P2PStreamError(P2PStreamError::UnknownReservedMessageId(_)) | + EthStreamError::P2PStreamError(P2PStreamError::UnknownDisconnectReason(_)) | + EthStreamError::P2PStreamError(P2PStreamError::MessageTooBig { .. }) | + EthStreamError::P2PStreamError(P2PStreamError::EmptyProtocolMessage) | + EthStreamError::P2PStreamError(P2PStreamError::PingerError(_)) | + EthStreamError::P2PStreamError(P2PStreamError::Snap(_)) => Some(BackoffKind::Medium), + _ => None, + } } } @@ -122,10 +175,10 @@ impl SessionError for PendingSessionHandshakeError { } } - fn should_backoff(&self) -> bool { + fn should_backoff(&self) -> Option { match self { PendingSessionHandshakeError::Eth(eth) => eth.should_backoff(), - PendingSessionHandshakeError::Ecies(_) => true, + PendingSessionHandshakeError::Ecies(_) => Some(BackoffKind::Low), } } } @@ -152,11 +205,11 @@ mod tests { )); assert_eq!(err.as_disconnected(), Some(DisconnectReason::TooManyPeers)); - assert!(err.should_backoff()); + assert_eq!(err.should_backoff(), Some(BackoffKind::Low)); let err = EthStreamError::P2PStreamError(P2PStreamError::HandshakeError( P2PHandshakeError::NoResponse, )); - assert!(err.should_backoff()); + assert_eq!(err.should_backoff(), Some(BackoffKind::Low)); } } diff --git a/crates/net/network/src/peers/manager.rs b/crates/net/network/src/peers/manager.rs index f2005b656..639c2c665 100644 --- a/crates/net/network/src/peers/manager.rs +++ b/crates/net/network/src/peers/manager.rs @@ -1,5 +1,5 @@ use crate::{ - error::SessionError, + error::{BackoffKind, SessionError}, peers::{ reputation::{is_banned_reputation, BACKOFF_REPUTATION_CHANGE, DEFAULT_REPUTATION}, ReputationChangeKind, ReputationChangeWeights, @@ -92,7 +92,7 @@ pub(crate) struct PeersManager { ban_duration: Duration, /// How long peers to which we could not connect for non-fatal reasons, e.g. /// [`DisconnectReason::TooManyPeers`], are put in time out. - backoff_duration: Duration, + backoff_durations: PeerBackoffDurations, /// If non-trusted peers should be connected to connect_trusted_nodes_only: bool, } @@ -106,7 +106,7 @@ impl PeersManager { reputation_weights, ban_list, ban_duration, - backoff_duration, + backoff_durations, trusted_nodes, connect_trusted_nodes_only, .. @@ -115,7 +115,7 @@ impl PeersManager { let now = Instant::now(); // We use half of the interval to decrease the max duration to `150%` in worst case - let unban_interval = ban_duration.min(backoff_duration) / 2; + let unban_interval = ban_duration.min(backoff_durations.low) / 2; let mut peers = HashMap::with_capacity(trusted_nodes.len()); @@ -137,7 +137,7 @@ impl PeersManager { connection_info, ban_list, ban_duration, - backoff_duration, + backoff_durations, connect_trusted_nodes_only, } } @@ -248,9 +248,9 @@ impl PeersManager { } /// Temporarily puts the peer in timeout - fn backoff_peer(&mut self, peer_id: PeerId) { + fn backoff_peer(&mut self, peer_id: PeerId, kind: BackoffKind) { trace!(target: "net::peers", ?peer_id, "backing off"); - self.ban_list.ban_peer_until(peer_id, std::time::Instant::now() + self.backoff_duration); + self.ban_list.ban_peer_until(peer_id, self.backoff_durations.backoff_until(kind)); } /// Unbans the peer @@ -367,10 +367,10 @@ impl PeersManager { }) } } else { - let reputation_change = if err.should_backoff() { + let reputation_change = if let Some(kind) = err.should_backoff() { // The peer has signaled that it is currently unable to process any more // connections, so we will hold off on attempting any new connections for a while - self.backoff_peer(*peer_id); + self.backoff_peer(*peer_id, kind); BACKOFF_REPUTATION_CHANGE.into() } else { self.reputation_weights.change(reputation_change) @@ -892,7 +892,7 @@ pub struct PeersConfig { pub ban_duration: Duration, /// How long to backoff peers that are we failed to connect to for non-fatal reasons, such as /// [`DisconnectReason::TooManyPeers`]. - pub backoff_duration: Duration, + pub backoff_durations: PeerBackoffDurations, /// Trusted nodes to connect to. pub trusted_nodes: HashSet, /// Connect to trusted nodes only? @@ -908,8 +908,7 @@ impl Default for PeersConfig { ban_list: Default::default(), // Ban peers for 12h ban_duration: Duration::from_secs(60 * 60 * 12), - // backoff peers for 1h - backoff_duration: Duration::from_secs(60 * 60), + backoff_durations: Default::default(), trusted_nodes: Default::default(), connect_trusted_nodes_only: false, } @@ -958,6 +957,52 @@ impl PeersConfig { } } +/// The durations to use when a backoff should be applied to a peer. +/// +/// See also [`BackoffKind`](BackoffKind). +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))] +pub struct PeerBackoffDurations { + /// Applies to connection problems where there is a chance that they will be resolved after the + /// short duration. + pub low: Duration, + /// Applies to more severe connection problems where there is a lower chance that they will be + /// resolved. + pub medium: Duration, + /// Intended for spammers, or bad peers in general. + pub high: Duration, +} + +impl PeerBackoffDurations { + /// Returns the corresponding [`Duration`] + pub fn backoff(&self, kind: BackoffKind) -> Duration { + match kind { + BackoffKind::Low => self.low, + BackoffKind::Medium => self.medium, + BackoffKind::High => self.high, + } + } + + /// Returns the timestamp until which we should backoff + pub fn backoff_until(&self, kind: BackoffKind) -> std::time::Instant { + let now = std::time::Instant::now(); + now + self.backoff(kind) + } +} + +impl Default for PeerBackoffDurations { + fn default() -> Self { + Self { + low: Duration::from_secs(30), + // 3min + medium: Duration::from_secs(60 * 3), + // 15min + high: Duration::from_secs(60 * 15), + } + } +} + #[derive(Debug, Error)] pub enum InboundConnectionError { ExceedsLimit(usize), @@ -975,7 +1020,7 @@ mod test { use super::PeersManager; use crate::{ peers::{ - manager::{ConnectionInfo, PeerConnectionState}, + manager::{ConnectionInfo, PeerBackoffDurations, PeerConnectionState}, PeerAction, ReputationChangeKind, }, session::PendingSessionHandshakeError, @@ -1064,8 +1109,9 @@ mod test { let peer = PeerId::random(); let socket_addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 1, 2)), 8008); - let backoff_duration = Duration::from_secs(3); - let config = PeersConfig { backoff_duration, ..Default::default() }; + let backoff_durations = + PeerBackoffDurations { low: Duration::from_secs(3), ..Default::default() }; + let config = PeersConfig { backoff_durations, ..Default::default() }; let mut peers = PeersManager::new(config); peers.add_peer(peer, socket_addr); @@ -1105,7 +1151,7 @@ mod test { assert!(peers.ban_list.is_banned_peer(&peer)); assert!(peers.peers.get(&peer).is_some()); - tokio::time::sleep(backoff_duration).await; + tokio::time::sleep(backoff_durations.low).await; match event!(peers) { PeerAction::UnBanPeer { peer_id, .. } => { @@ -1127,8 +1173,9 @@ mod test { let peer = PeerId::random(); let socket_addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 1, 2)), 8008); - let backoff_duration = Duration::from_secs(3); - let config = PeersConfig { backoff_duration, ..Default::default() }; + let backoff_durations = + PeerBackoffDurations { high: Duration::from_secs(3), ..Default::default() }; + let config = PeersConfig { backoff_durations, ..Default::default() }; let mut peers = PeersManager::new(config); peers.add_peer(peer, socket_addr); @@ -1168,7 +1215,7 @@ mod test { assert!(peers.ban_list.is_banned_peer(&peer)); assert!(peers.peers.get(&peer).is_some()); - tokio::time::sleep(backoff_duration).await; + tokio::time::sleep(backoff_durations.high).await; match event!(peers) { PeerAction::UnBanPeer { peer_id, .. } => {