Report local force-closing connections to Prometheus (#5575)

* Report local force-closing connections to Prometheus

* Also report ping timeouts separately

* Address concerns of #5571
This commit is contained in:
Pierre Krieger
2020-04-09 11:24:01 +02:00
committed by GitHub
parent 2faabff007
commit cebd073649
5 changed files with 16 additions and 6 deletions
+1
View File
@@ -77,6 +77,7 @@ pub mod sync;
pub use block_requests::BlockRequests;
pub use light_client_handler::LightClientHandler;
pub use generic_proto::LegacyConnectionKillError;
const REQUEST_TIMEOUT_SEC: u64 = 40;
/// Interval at which we perform time based maintenance
@@ -21,6 +21,7 @@
//! network, then performs the Substrate protocol handling on top.
pub use self::behaviour::{GenericProto, GenericProtoOut};
pub use self::handler::LegacyConnectionKillError;
mod behaviour;
mod handler;
@@ -1193,10 +1193,10 @@ impl NetworkBehaviour for GenericProto {
}
NotifsHandlerOut::ProtocolError { error, .. } => {
warn!(target: "sub-libp2p",
debug!(target: "sub-libp2p",
"Handler({:?}) => Severe protocol error: {:?}",
source, error);
// A severe protocol error happens when we detect a "bad" peer, such as a per on
// A severe protocol error happens when we detect a "bad" peer, such as a peer on
// a different chain, or a peer that doesn't speak the same protocol(s). We
// decrease the peer's reputation, hence lowering the chances we try this peer
// again in the short term.
@@ -15,6 +15,7 @@
// along with Substrate. If not, see <http://www.gnu.org/licenses/>.
pub use self::group::{NotifsHandlerProto, NotifsHandler, NotifsHandlerIn, NotifsHandlerOut};
pub use self::legacy::ConnectionKillError as LegacyConnectionKillError;
mod group;
mod legacy;
+11 -4
View File
@@ -33,13 +33,14 @@ use crate::{
NetworkState, NotConnectedPeer as NetworkStateNotConnectedPeer, Peer as NetworkStatePeer,
},
on_demand_layer::AlwaysBadChecker,
protocol::{self, event::Event, light_client_handler, sync::SyncState, PeerInfo, Protocol},
protocol::{self, event::Event, light_client_handler, LegacyConnectionKillError, sync::SyncState, PeerInfo, Protocol},
transport, ReputationChange,
};
use futures::prelude::*;
use libp2p::{PeerId, Multiaddr};
use libp2p::core::{ConnectedPoint, Executor, connection::{ConnectionError, PendingConnectionError}};
use libp2p::core::{ConnectedPoint, Executor, connection::{ConnectionError, PendingConnectionError}, either::EitherError};
use libp2p::kad::record;
use libp2p::ping::handler::PingFailure;
use libp2p::swarm::{NetworkBehaviour, SwarmBuilder, SwarmEvent, protocols_handler::NodeHandlerWrapperError};
use log::{error, info, trace, warn};
use parking_lot::Mutex;
@@ -871,7 +872,7 @@ impl Metrics {
connections: register(GaugeVec::new(
Opts::new(
"sub_libp2p_connections",
"Number of active libp2p connections"
"Number of established libp2p connections"
),
&["direction"]
)?, registry)?,
@@ -974,7 +975,7 @@ impl Metrics {
pending_connections_errors_total: register(CounterVec::new(
Opts::new(
"sub_libp2p_pending_connections_errors_total",
"Total number of node connection failures"
"Total number of pending connection errors"
),
&["reason"]
)?, registry)?,
@@ -1134,6 +1135,12 @@ impl<B: BlockT + 'static, H: ExHashT> Future for NetworkWorker<B, H> {
metrics.connections_closed_total.with_label_values(&["transport-error"]).inc(),
ConnectionError::ConnectionLimit(_) =>
metrics.connections_closed_total.with_label_values(&["limit-reached"]).inc(),
ConnectionError::Handler(NodeHandlerWrapperError::Handler(EitherError::A(EitherError::A(
EitherError::A(EitherError::B(EitherError::A(PingFailure::Timeout))))))) =>
metrics.connections_closed_total.with_label_values(&["ping-timeout"]).inc(),
ConnectionError::Handler(NodeHandlerWrapperError::Handler(EitherError::A(EitherError::A(
EitherError::A(EitherError::A(EitherError::B(LegacyConnectionKillError))))))) =>
metrics.connections_closed_total.with_label_values(&["force-closed"]).inc(),
ConnectionError::Handler(NodeHandlerWrapperError::Handler(_)) =>
metrics.connections_closed_total.with_label_values(&["protocol-error"]).inc(),
ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) =>