feat: add stagedsync headers metrics (#498)

* add dockerfile for reth

* Add docker compose for prometheus

* Add some metrics

* Add p2p connection errors metric

* Add dependency caching for dockerfile

This reduces image build times by ~50% after the first one.

Uses cargo-chef inside the image.
More info in: https://morioh.com/p/987a2bda4526

* Add --metrics flag to docker-compose config file

* add Cargo.lock

* Move docker-compose.yml to docker directory

* Apply formatting

* Remove docker folder

* Remove .dockerignore file

* Add 'reth' prefix to metric names

* Add headers errors and request time metrics

* Modularize metrics exporter functionality and describe metrics

* Format files

* Add metrics documentation in metrics.md

* Fix metrics doc title

* Commit changes after rebase

* Solve conflict

* Modularize metrics describers

* Add stages_metrics_describer

* Rearrange header error metrics

* Add update_headers_metrics function

* Add one-line docs to describers

* Remove commented line

* Refactor metrics describer

* Update metrics doc

* Fix import

* Add header metrics struct

* add new metrics format in the headers execute method

* Add default implementation for HeaderMetrics

* Fix typo

* Fix another typo

* Fix more typos

* Move new HeaderMetrics method to default

* Solve conflicts

* Fix test

Co-authored-by: Tomás <tomas.gruner@lambdaclass.com>
This commit is contained in:
Mariano A. Nicolini
2022-12-22 11:45:57 -03:00
committed by GitHub
parent 7ce22fd186
commit b12939db47
10 changed files with 161 additions and 46 deletions

33
Cargo.lock generated
View File

@ -88,9 +88,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anyhow"
version = "1.0.67"
version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7724808837b77f4b4de9d283820f9d98bcf496d5692934b857a2399d31ff22e6"
checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61"
[[package]]
name = "aquamarine"
@ -395,7 +395,7 @@ checksum = "982a0cf6a99c350d7246035613882e376d58cebe571785abc5da4f648d53ac0a"
dependencies = [
"camino",
"cargo-platform",
"semver 1.0.15",
"semver 1.0.16",
"serde",
"serde_json",
"thiserror",
@ -1270,7 +1270,7 @@ dependencies = [
[[package]]
name = "ethers-core"
version = "1.0.2"
source = "git+https://github.com/gakonst/ethers-rs#bb4af1c13478ab368e89be143e79b553eb956922"
source = "git+https://github.com/gakonst/ethers-rs#91cd6ccce8c71c82f462d8a8838a94087565623a"
dependencies = [
"arrayvec",
"bytes",
@ -1295,7 +1295,7 @@ dependencies = [
[[package]]
name = "ethers-providers"
version = "1.0.2"
source = "git+https://github.com/gakonst/ethers-rs#bb4af1c13478ab368e89be143e79b553eb956922"
source = "git+https://github.com/gakonst/ethers-rs#91cd6ccce8c71c82f462d8a8838a94087565623a"
dependencies = [
"async-trait",
"auto_impl",
@ -3014,9 +3014,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.48"
version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9d89e5dba24725ae5678020bf8f1357a9aa7ff10736b551adbcd3f8d17d766f"
checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
dependencies = [
"unicode-ident",
]
@ -3060,9 +3060,9 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.22"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "556d0f47a940e895261e77dc200d5eadfc6ef644c179c6f5edfc105e3a2292c8"
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
dependencies = [
"proc-macro2",
]
@ -3292,6 +3292,7 @@ dependencies = [
"dirs-next",
"eyre",
"futures",
"metrics",
"metrics-exporter-prometheus",
"metrics-util",
"reth-consensus",
@ -3934,7 +3935,7 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
dependencies = [
"semver 1.0.15",
"semver 1.0.16",
]
[[package]]
@ -4122,9 +4123,9 @@ dependencies = [
[[package]]
name = "semver"
version = "1.0.15"
version = "1.0.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bfa246f936730408c0abee392cc1a50b118ece708c7f630516defd64480c7d8"
checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a"
dependencies = [
"serde",
]
@ -4172,9 +4173,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.90"
version = "1.0.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8778cc0b528968fe72abec38b5db5a20a70d148116cd9325d2bc5f5180ca3faf"
checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883"
dependencies = [
"itoa",
"ryu",
@ -4492,9 +4493,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.106"
version = "1.0.107"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ee3a69cd2c7e06684677e5629b3878b253af05e4714964204279c6bc02cf0b"
checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
dependencies = [
"proc-macro2",
"quote",

View File

@ -36,6 +36,7 @@ dirs-next = "2.0.0"
confy = "0.5"
# rpc/metrics
metrics = "0.20.1"
metrics-exporter-prometheus = { version = "0.11.0", features = ["http-listener"] }
metrics-util = "0.14.0"

View File

@ -11,5 +11,6 @@ pub mod config;
pub mod db;
pub mod dirs;
pub mod node;
pub mod prometheus_exporter;
pub mod test_eth_chain;
pub mod util;

View File

@ -4,12 +4,10 @@
use crate::{
config::Config,
dirs::{ConfigPath, DbPath},
prometheus_exporter,
util::chainspec::{chain_spec_value_parser, ChainSpecification, Genesis},
};
use clap::{crate_version, Parser};
use eyre::WrapErr;
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_util::layers::{PrefixLayer, Stack};
use reth_consensus::BeaconConsensus;
use reth_db::{
cursor::DbCursorRO,
@ -27,8 +25,10 @@ use reth_network::{
};
use reth_primitives::{Account, Header, H256};
use reth_provider::{db_provider::ProviderImpl, BlockProvider, HeaderProvider};
use reth_stages::stages::{
bodies::BodyStage, headers::HeaderStage, sender_recovery::SenderRecoveryStage,
use reth_stages::{
stages::{bodies::BodyStage, headers::HeaderStage, sender_recovery::SenderRecoveryStage},
stages_metrics::HeaderMetrics,
stages_metrics_describer,
};
use std::{net::SocketAddr, path::Path, sync::Arc};
use tracing::{debug, info};
@ -93,15 +93,8 @@ impl Command {
if let Some(listen_addr) = self.metrics {
info!("Starting metrics endpoint at {}", listen_addr);
let (recorder, exporter) = PrometheusBuilder::new()
.with_http_listener(listen_addr)
.build()
.wrap_err("Could not build Prometheus endpoint.")?;
tokio::task::spawn(exporter);
Stack::new(recorder)
.push(PrefixLayer::new("reth"))
.install()
.wrap_err("Couldn't set metrics recorder.")?;
prometheus_exporter::initialize(listen_addr)?;
stages_metrics_describer::describe();
}
let chain_id = self.chain.consensus.chain_id;
@ -125,6 +118,7 @@ impl Command {
client: fetch_client.clone(),
network_handle: network.clone(),
commit_threshold: config.stages.headers.commit_threshold,
metrics: HeaderMetrics::default(),
})
.push(BodyStage {
downloader: Arc::new(

View File

@ -0,0 +1,20 @@
//! Prometheus exporter
use eyre::WrapErr;
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_util::layers::{PrefixLayer, Stack};
use std::net::SocketAddr;
/// Starts the Prometheus metrics endpoint and installs the metrics recorder.
///
/// A Prometheus recorder/exporter pair is built with an HTTP listener bound to
/// `listen_addr`. The exporter is spawned as a Tokio task, and the recorder is
/// installed behind a [`PrefixLayer`] using the `reth` prefix.
///
/// # Errors
///
/// Returns an error if the Prometheus endpoint cannot be built or if the
/// recorder cannot be installed.
pub(crate) fn initialize(listen_addr: SocketAddr) -> eyre::Result<()> {
    let builder = PrometheusBuilder::new().with_http_listener(listen_addr);
    let (recorder, exporter) = builder.build().wrap_err("Could not build Prometheus endpoint.")?;

    // The exporter is handed to the runtime; the returned task handle is not kept.
    tokio::task::spawn(exporter);

    let prefixed_recorder = Stack::new(recorder).push(PrefixLayer::new("reth"));
    prefixed_recorder.install().wrap_err("Couldn't set metrics recorder.")?;

    Ok(())
}

View File

@ -27,6 +27,12 @@ mod test_utils;
/// Implementations of stages.
pub mod stages;
/// Describers for stages metrics.
pub mod stages_metrics_describer;
/// Stages metrics.
pub mod stages_metrics;
pub use db::Transaction;
pub use error::*;
pub use id::*;

View File

@ -1,6 +1,6 @@
use crate::{
db::Transaction, DatabaseIntegrityError, ExecInput, ExecOutput, Stage, StageError, StageId,
UnwindInput, UnwindOutput,
db::Transaction, stages_metrics, DatabaseIntegrityError, ExecInput, ExecOutput, Stage,
StageError, StageId, UnwindInput, UnwindOutput,
};
use futures_util::StreamExt;
use reth_db::{
@ -21,6 +21,7 @@ use reth_interfaces::{
},
};
use reth_primitives::{BlockNumber, Header, SealedHeader, H256, U256};
use stages_metrics::HeaderMetrics;
use std::{fmt::Debug, sync::Arc};
use tracing::*;
@ -53,6 +54,8 @@ pub struct HeaderStage<D: HeaderDownloader, C: Consensus, H: HeadersClient, S: S
pub network_handle: S,
/// The number of block headers to commit at once
pub commit_threshold: u64,
/// Header metrics
pub metrics: HeaderMetrics,
}
#[async_trait::async_trait]
@ -81,7 +84,6 @@ impl<DB: Database, D: HeaderDownloader, C: Consensus, H: HeadersClient, S: Statu
let mut current_progress = stage_progress;
let mut stream =
self.downloader.stream(head.clone(), tip).chunks(self.commit_threshold as usize);
// The stage relies on the downloader to return the headers
// in descending order starting from the tip down to
// the local head (latest block in db)
@ -89,6 +91,7 @@ impl<DB: Database, D: HeaderDownloader, C: Consensus, H: HeadersClient, S: Statu
match headers.into_iter().collect::<Result<Vec<_>, _>>() {
Ok(res) => {
info!(target: "sync::stages::headers", len = res.len(), "Received headers");
self.metrics.headers_counter.increment(res.len() as u64);
// Perform basic response validation
self.validate_header_response(&res)?;
@ -96,20 +99,23 @@ impl<DB: Database, D: HeaderDownloader, C: Consensus, H: HeadersClient, S: Statu
self.write_headers::<DB>(tx, res).await?.unwrap_or_default();
current_progress = current_progress.max(write_progress);
}
Err(e) => match e {
DownloadError::Timeout => {
warn!(target: "sync::stages::headers", "No response for header request");
return Err(StageError::Recoverable(DownloadError::Timeout.into()))
Err(e) => {
self.metrics.update_headers_error_metrics(&e);
match e {
DownloadError::Timeout => {
warn!(target: "sync::stages::headers", "No response for header request");
return Err(StageError::Recoverable(DownloadError::Timeout.into()))
}
DownloadError::HeaderValidation { hash, error } => {
error!(target: "sync::stages::headers", ?error, ?hash, "Validation error");
return Err(StageError::Validation { block: stage_progress, error })
}
error => {
error!(target: "sync::stages::headers", ?error, "Unexpected error");
return Err(StageError::Recoverable(error.into()))
}
}
DownloadError::HeaderValidation { hash, error } => {
error!(target: "sync::stages::headers", ?error, ?hash, "Validation error");
return Err(StageError::Validation { block: stage_progress, error })
}
error => {
error!(target: "sync::stages::headers", ?error, "Unexpected error");
return Err(StageError::Recoverable(error.into()))
}
},
}
}
}
@ -448,6 +454,7 @@ mod tests {
mod test_runner {
use crate::{
stages::headers::HeaderStage,
stages_metrics::HeaderMetrics,
test_utils::{
ExecuteStageTestRunner, StageTestRunner, TestRunnerError, TestTransaction,
UnwindStageTestRunner,
@ -502,6 +509,7 @@ mod tests {
downloader: self.downloader.clone(),
network_handle: self.network_handle.clone(),
commit_threshold: 100,
metrics: HeaderMetrics::default(),
}
}
}

View File

@ -0,0 +1,46 @@
use metrics::{register_counter, Counter};
use reth_interfaces::p2p::error::DownloadError;
use std::fmt;
/// Stagedsync header metrics.
///
/// Holds one [`Counter`] handle per metric tracked for header downloads.
pub struct HeaderMetrics {
/// Number of headers successfully retrieved
pub headers_counter: Counter,
/// Number of timeout errors while requesting headers
pub headers_timeout_errors: Counter,
/// Number of validation errors while requesting headers
pub headers_validation_errors: Counter,
/// Number of unexpected errors while requesting headers
pub headers_unexpected_errors: Counter,
}
impl HeaderMetrics {
    /// Increments the error counter that corresponds to `error`.
    ///
    /// Timeouts and header validation failures each have a dedicated counter;
    /// every other [`DownloadError`] variant is counted as an unexpected error.
    pub fn update_headers_error_metrics(&self, error: &DownloadError) {
        // Pick the matching counter first so the increment happens in one place.
        let counter = match error {
            DownloadError::Timeout => &self.headers_timeout_errors,
            DownloadError::HeaderValidation { .. } => &self.headers_validation_errors,
            _ => &self.headers_unexpected_errors,
        };
        counter.increment(1);
    }
}
impl Default for HeaderMetrics {
    /// Registers all header counters and returns a struct holding their handles.
    fn default() -> Self {
        // Registration order matches the field order of the struct.
        let headers_counter = register_counter!("stages.headers.counter");
        let headers_timeout_errors = register_counter!("stages.headers.timeout_errors");
        let headers_validation_errors = register_counter!("stages.headers.validation_errors");
        let headers_unexpected_errors = register_counter!("stages.headers.unexpected_errors");

        Self {
            headers_counter,
            headers_timeout_errors,
            headers_validation_errors,
            headers_unexpected_errors,
        }
    }
}
impl fmt::Debug for HeaderMetrics {
    /// Formats the value as the bare type name; the counter fields are not
    /// included in the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("HeaderMetrics")
    }
}

View File

@ -0,0 +1,27 @@
use metrics::{describe_counter, describe_histogram};
/// Describe stagedsync headers metrics.
///
/// Attaches a human-readable description to each `stages.headers.*` metric
/// name listed below.
fn describe_header_metrics() {
describe_counter!("stages.headers.counter", "Number of headers successfully retrieved");
describe_counter!(
"stages.headers.timeout_errors",
"Number of timeout errors while requesting headers"
);
describe_counter!(
"stages.headers.validation_errors",
"Number of validation errors while requesting headers"
);
describe_counter!(
"stages.headers.unexpected_errors",
"Number of unexpected errors while requesting headers"
);
// NOTE(review): this histogram is only described here; confirm that
// `stages.headers.request_time` is actually recorded somewhere.
describe_histogram!(
"stages.headers.request_time",
"Elapsed time of successful header requests"
);
}
/// Describe stagedsync metrics.
///
/// Public entry point of this module; currently it only describes the headers
/// metrics.
pub fn describe() {
describe_header_metrics();
}

View File

@ -16,6 +16,9 @@ The main difference between metrics and traces is therefore that metrics are sys
### How to add a metric
To add metrics use the [`metrics`][metrics] crate.
1. Add the code emitting the metric.
2. Add the metrics description in the crate's metrics describer module, e.g.: [stages metrics describer](https://github.com/paradigmxyz/reth/blob/main/crates/stages/src/stages_metrics_describer.rs).
3. Document the metric in this file.
#### Metric anatomy
@ -53,6 +56,14 @@ How the metrics are exposed to the end-user is determined by the CLI.
[^1]: The top-level namespace is added by the CLI using [`metrics_util::layers::PrefixLayer`][metrics_util.PrefixLayer].
### Current metrics
#### StagedSync Headers
- `stages.headers.counter`: Number of headers successfully retrieved
- `stages.headers.timeout_errors`: Number of timeout errors while requesting headers
- `stages.headers.validation_errors`: Number of validation errors while requesting headers
- `stages.headers.unexpected_errors`: Number of unexpected errors while requesting headers
- `stages.headers.request_time`: Elapsed time of successful header requests
[metrics]: https://docs.rs/metrics
[metrics.Key]: https://docs.rs/metrics/latest/metrics/struct.Key.html
[metrics.KeyName]: https://docs.rs/metrics/latest/metrics/struct.KeyName.html