Make ETL file size configurable (#6927)

Co-authored-by: Alexey Shekhirin <a.shekhirin@gmail.com>
Co-authored-by: joshieDo <ranriver@protonmail.com>

Author: Misha
Date: 2024-03-13 14:51:30 +01:00 (committed by GitHub)
Parent: c9c269b64f
Commit: 5d6ac4c815

14 changed files with 87 additions and 14 deletions

@@ -127,6 +127,7 @@ impl Command {
                 header_downloader,
                 body_downloader,
                 factory.clone(),
+                stage_conf.etl.etl_file_size,
             )
             .set(SenderRecoveryStage {
                 commit_threshold: stage_conf.sender_recovery.commit_threshold,

@@ -178,6 +178,9 @@ impl ImportCommand {
             reth_revm::EvmProcessorFactory::new(self.chain.clone(), EthEvmConfig::default());

         let max_block = file_client.max_block().unwrap_or(0);
+
+        let etl_file_size = config.stages.etl.etl_file_size;
+
         let mut pipeline = Pipeline::builder()
             .with_tip_sender(tip_tx)
             // we want to sync all blocks the file client provides or 0 if empty
@@ -190,6 +193,7 @@ impl ImportCommand {
                 header_downloader,
                 body_downloader,
                 factory.clone(),
+                etl_file_size,
             )
             .set(SenderRecoveryStage {
                 commit_threshold: config.stages.sender_recovery.commit_threshold,

@@ -82,6 +82,10 @@ pub struct Command {
     #[arg(long)]
     batch_size: Option<u64>,

+    /// The maximum size in bytes of data held in memory before being flushed to disk as a file.
+    #[arg(long)]
+    etl_file_size: Option<usize>,
+
     /// Normally, running the stage requires unwinding for stages that already
     /// have been run, in order to not rewrite to the same database slots.
     ///
@@ -151,6 +155,8 @@ impl Command {
         let batch_size = self.batch_size.unwrap_or(self.to - self.from + 1);
+        let etl_file_size = self.etl_file_size.unwrap_or(500 * 1024 * 1024);
+
         let (mut exec_stage, mut unwind_stage): (Box<dyn Stage<_>>, Option<Box<dyn Stage<_>>>) =
             match self.stage {
                 StageEnum::Bodies => {
@@ -229,7 +235,7 @@ impl Command {
                     )
                 }
                 StageEnum::TxLookup => {
-                    (Box::new(TransactionLookupStage::new(batch_size, None)), None)
+                    (Box::new(TransactionLookupStage::new(batch_size, etl_file_size, None)), None)
                 }
                 StageEnum::AccountHashing => {
                     (Box::new(AccountHashingStage::new(1, batch_size)), None)

@@ -238,7 +238,7 @@ RPC:
       --rpc-max-tracing-requests <COUNT>
           Maximum number of concurrent tracing requests

-          [default: 8]
+          [default: 10]

       --rpc-max-blocks-per-filter <COUNT>
           Maximum number of blocks that could be scanned per filter request. (0 = entire chain)

@@ -61,6 +61,9 @@ Options:
       --batch-size <BATCH_SIZE>
          Batch size for stage execution and unwind

+      --etl-file-size <ETL_FILE_SIZE>
+          Size for temporary file during ETL stages
+
   -s, --skip-unwind
          Normally, running the stage requires unwinding for stages that already have been run, in order to not rewrite to the same database slots.

@@ -221,6 +221,19 @@ The storage history indexing stage builds an index of what blocks a particular s
 commit_threshold = 100000
 ```

+### `etl`
+
+An ETL (extract, transform, load) data collector, used mainly to insert data into `MDBX` in a sorted manner.
+
+```toml
+[stages.etl]
+# The maximum size in bytes of data held in memory before being flushed to disk as a file.
+#
+# A lower threshold means more frequent flushes and a smaller in-memory
+# buffer, at the cost of more temporary files.
+file_size = 524_288_000 # 500 * 1024 * 1024
+```
+
 ## The `[peers]` section

 The peers section is used to configure how the networking component of reth establishes and maintains connections to peers.
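
For intuition about what this knob controls, here is a minimal sketch of a size-bounded ETL collector: entries are buffered in memory and, once the byte budget is reached, sorted and spilled to a temporary file. This is hypothetical illustration code, not `reth_etl`'s actual API, and it assumes the `tempfile` crate:

```rust
use std::io::Write;

/// Hypothetical size-bounded collector (illustration only, not reth_etl's API).
struct SketchCollector {
    buffer: Vec<(Vec<u8>, Vec<u8>)>,     // (key, value) pairs held in memory
    buffer_bytes: usize,                 // approximate bytes currently buffered
    file_size: usize,                    // the configured `file_size` budget
    files: Vec<tempfile::NamedTempFile>, // flushed, individually sorted runs
}

impl SketchCollector {
    fn new(file_size: usize) -> Self {
        Self { buffer: Vec::new(), buffer_bytes: 0, file_size, files: Vec::new() }
    }

    fn insert(&mut self, key: Vec<u8>, value: Vec<u8>) -> std::io::Result<()> {
        self.buffer_bytes += key.len() + value.len();
        self.buffer.push((key, value));
        if self.buffer_bytes >= self.file_size {
            self.flush()?; // a lower budget means more frequent flushes
        }
        Ok(())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        // Sorting each run before writing is what lets the final load into
        // MDBX proceed in key order.
        self.buffer.sort_by(|a, b| a.0.cmp(&b.0));
        let mut file = tempfile::NamedTempFile::new()?;
        for (key, value) in self.buffer.drain(..) {
            file.write_all(&key)?;
            file.write_all(&value)?;
        }
        self.files.push(file);
        self.buffer_bytes = 0;
        Ok(())
    }
}

fn main() -> std::io::Result<()> {
    // A tiny 64-byte budget forces flushes after a handful of entries.
    let mut collector = SketchCollector::new(64);
    for i in 0u8..16 {
        collector.insert(vec![i], vec![i; 8])?;
    }
    println!("flushed {} sorted run(s)", collector.files.len());
    Ok(())
}
```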

@@ -70,6 +70,8 @@ pub struct StageConfig {
     pub index_account_history: IndexHistoryConfig,
     /// Index Storage History stage configuration.
     pub index_storage_history: IndexHistoryConfig,
+    /// Common ETL related configuration.
+    pub etl: EtlConfig,
 }

 /// Header stage configuration.
@@ -235,7 +237,21 @@ impl Default for TransactionLookupConfig {
     }
 }

-/// History History stage configuration.
+/// Common ETL related configuration.
+#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Serialize)]
+#[serde(default)]
+pub struct EtlConfig {
+    /// The maximum size in bytes of data held in memory before being flushed to disk as a file.
+    pub etl_file_size: usize,
+}
+
+impl Default for EtlConfig {
+    fn default() -> Self {
+        Self { etl_file_size: 500 * (1024 * 1024) }
+    }
+}
+
+/// History stage configuration.
 #[derive(Debug, Clone, Copy, Deserialize, PartialEq, Serialize)]
 #[serde(default)]
 pub struct IndexHistoryConfig {
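
Because `EtlConfig` derives `Deserialize` with `#[serde(default)]`, a missing `[stages.etl]` table (or a missing field) falls back to the 500 MiB default. A self-contained sketch of that round-trip, assuming the `serde` and `toml` crates and mirroring the struct above (note that deserialization keys on the field name `etl_file_size`):

```rust
use serde::Deserialize;

/// Mirror of the `EtlConfig` struct added above.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(default)]
pub struct EtlConfig {
    pub etl_file_size: usize,
}

impl Default for EtlConfig {
    fn default() -> Self {
        Self { etl_file_size: 500 * (1024 * 1024) }
    }
}

fn main() {
    // An explicit value in the config file overrides the default...
    let cfg: EtlConfig = toml::from_str("etl_file_size = 1073741824").unwrap();
    assert_eq!(cfg.etl_file_size, 1024 * 1024 * 1024);

    // ...while an empty table falls back via #[serde(default)].
    let cfg: EtlConfig = toml::from_str("").unwrap();
    assert_eq!(cfg.etl_file_size, 500 * 1024 * 1024);
}
```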

@@ -406,6 +406,7 @@ where
                 header_downloader,
                 body_downloader,
                 executor_factory.clone(),
+                500 * (1024 * 1024),
             ))
         }
     };

@@ -837,6 +837,7 @@ impl NodeConfig {
                 header_downloader,
                 body_downloader,
                 factory.clone(),
+                stage_config.etl.etl_file_size,
             )
             .set(SenderRecoveryStage {
                 commit_threshold: stage_config.sender_recovery.commit_threshold,
@@ -870,6 +871,7 @@ impl NodeConfig {
             .set(MerkleStage::new_execution(stage_config.merkle.clean_threshold))
             .set(TransactionLookupStage::new(
                 stage_config.transaction_lookup.chunk_size,
+                stage_config.etl.etl_file_size,
                 prune_modes.transaction_lookup,
             ))
             .set(IndexAccountHistoryStage::new(

@@ -57,7 +57,7 @@ fn transaction_lookup(c: &mut Criterion) {
     let mut group = c.benchmark_group("Stages");
     // don't need to run each stage for that many times
     group.sample_size(10);

-    let stage = TransactionLookupStage::new(DEFAULT_NUM_BLOCKS, None);
+    let stage = TransactionLookupStage::new(DEFAULT_NUM_BLOCKS, 500 * 1024 * 1024, None);

     let db = setup::txs_testdata(DEFAULT_NUM_BLOCKS);

@@ -59,6 +59,7 @@
 //!         headers_downloader,
 //!         bodies_downloader,
 //!         executor_factory,
+//!         500*1024*1024,
 //!     )
 //! )
 //! .build(provider_factory, static_file_producer);

@@ -100,6 +100,7 @@ impl<Provider, H, B, EF> DefaultStages<Provider, H, B, EF> {
         header_downloader: H,
         body_downloader: B,
         executor_factory: EF,
+        etl_file_size: usize,
     ) -> Self
     where
         EF: ExecutorFactory,
@@ -111,6 +112,7 @@ impl<Provider, H, B, EF> DefaultStages<Provider, H, B, EF> {
                 consensus,
                 header_downloader,
                 body_downloader,
+                etl_file_size,
             ),
             executor_factory,
         }
@@ -162,6 +164,8 @@ pub struct OnlineStages<Provider, H, B> {
     header_downloader: H,
     /// The block body downloader
     body_downloader: B,
+    /// The size of temporary files in bytes for ETL data collector.
+    etl_file_size: usize,
 }

 impl<Provider, H, B> OnlineStages<Provider, H, B> {
@@ -172,8 +176,9 @@ impl<Provider, H, B> OnlineStages<Provider, H, B> {
         consensus: Arc<dyn Consensus>,
         header_downloader: H,
         body_downloader: B,
+        etl_file_size: usize,
     ) -> Self {
-        Self { provider, header_mode, consensus, header_downloader, body_downloader }
+        Self { provider, header_mode, consensus, header_downloader, body_downloader, etl_file_size }
     }
 }

@@ -198,9 +203,16 @@ where
         mode: HeaderSyncMode,
         header_downloader: H,
         consensus: Arc<dyn Consensus>,
+        etl_file_size: usize,
     ) -> StageSetBuilder<DB> {
         StageSetBuilder::default()
-            .add_stage(HeaderStage::new(provider, header_downloader, mode, consensus.clone()))
+            .add_stage(HeaderStage::new(
+                provider,
+                header_downloader,
+                mode,
+                consensus.clone(),
+                etl_file_size,
+            ))
             .add_stage(bodies)
     }
 }

@@ -219,6 +231,7 @@ where
             self.header_downloader,
             self.header_mode,
             self.consensus.clone(),
+            self.etl_file_size,
         ))
         .add_stage(BodyStage::new(self.body_downloader))
     }

@@ -74,6 +74,7 @@ where
         downloader: Downloader,
         mode: HeaderSyncMode,
         consensus: Arc<dyn Consensus>,
+        etl_file_size: usize,
     ) -> Self {
         Self {
             provider: database,
@@ -81,8 +82,8 @@ where
             mode,
             consensus,
             sync_gap: None,
-            hash_collector: Collector::new(100 * (1024 * 1024)),
-            header_collector: Collector::new(100 * (1024 * 1024)),
+            hash_collector: Collector::new(etl_file_size / 2),
+            header_collector: Collector::new(etl_file_size / 2),
             is_etl_ready: false,
         }
     }
@@ -419,6 +420,7 @@ mod tests {
             (*self.downloader_factory)(),
             HeaderSyncMode::Tip(self.channel.1.clone()),
             self.consensus.clone(),
+            500 * (1024 * 1024),
         )
     }
 }
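
A note on the collector change above: the configured budget is now split evenly between the stage's two ETL collectors, so with the 500 MiB default each gets 250 MiB (previously both were fixed at 100 MiB). A trivial sketch of the split:

```rust
// Each of HeaderStage's two collectors gets half of the configured budget,
// keeping their combined in-memory footprint within `etl_file_size`.
fn collector_budgets(etl_file_size: usize) -> (usize, usize) {
    (etl_file_size / 2, etl_file_size / 2) // (hash_collector, header_collector)
}

fn main() {
    let (hashes, headers) = collector_budgets(500 * 1024 * 1024);
    assert_eq!(hashes, 250 * 1024 * 1024); // up from the fixed 100 MiB per collector
    assert_eq!(headers, 250 * 1024 * 1024);
}
```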

@@ -32,19 +32,20 @@ pub struct TransactionLookupStage {
     /// The maximum number of lookup entries to hold in memory before pushing them to
     /// [`reth_etl::Collector`].
     chunk_size: u64,
+    etl_file_size: usize,
     prune_mode: Option<PruneMode>,
 }

 impl Default for TransactionLookupStage {
     fn default() -> Self {
-        Self { chunk_size: 5_000_000, prune_mode: None }
+        Self { chunk_size: 5_000_000, etl_file_size: 500 * 1024 * 1024, prune_mode: None }
     }
 }

 impl TransactionLookupStage {
     /// Create new instance of [TransactionLookupStage].
-    pub fn new(chunk_size: u64, prune_mode: Option<PruneMode>) -> Self {
-        Self { chunk_size, prune_mode }
+    pub fn new(chunk_size: u64, etl_file_size: usize, prune_mode: Option<PruneMode>) -> Self {
+        Self { chunk_size, etl_file_size, prune_mode }
     }
 }

@@ -99,7 +100,7 @@ impl<DB: Database> Stage<DB> for TransactionLookupStage {
         }

         // 500MB temporary files
-        let mut hash_collector: Collector<TxHash, TxNumber> = Collector::new(500 * (1024 * 1024));
+        let mut hash_collector: Collector<TxHash, TxNumber> = Collector::new(self.etl_file_size);

         debug!(
             target: "sync::stages::transaction_lookup",
@@ -397,12 +398,18 @@ mod tests {
     struct TransactionLookupTestRunner {
         db: TestStageDB,
         chunk_size: u64,
+        etl_file_size: usize,
         prune_mode: Option<PruneMode>,
     }

     impl Default for TransactionLookupTestRunner {
         fn default() -> Self {
-            Self { db: TestStageDB::default(), chunk_size: 1000, prune_mode: None }
+            Self {
+                db: TestStageDB::default(),
+                chunk_size: 1000,
+                etl_file_size: 500 * 1024 * 1024,
+                prune_mode: None,
+            }
         }
     }

@@ -449,7 +456,11 @@ mod tests {
     }

     fn stage(&self) -> Self::S {
-        TransactionLookupStage { chunk_size: self.chunk_size, prune_mode: self.prune_mode }
+        TransactionLookupStage {
+            chunk_size: self.chunk_size,
+            etl_file_size: self.etl_file_size,
+            prune_mode: self.prune_mode,
+        }
     }
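
For reference, a self-contained stand-in for the updated constructor (types simplified; the real `PruneMode` comes from reth's primitives):

```rust
// Stand-in mirroring the new constructor shape from the hunk above
// (PruneMode simplified to an empty enum for illustration).
#[derive(Debug)]
enum PruneMode {}

struct TransactionLookupStage {
    chunk_size: u64,
    etl_file_size: usize,
    prune_mode: Option<PruneMode>,
}

impl TransactionLookupStage {
    fn new(chunk_size: u64, etl_file_size: usize, prune_mode: Option<PruneMode>) -> Self {
        Self { chunk_size, etl_file_size, prune_mode }
    }
}

fn main() {
    // Mirrors the stage's new Default: 5M entries per chunk, 500 MiB ETL budget.
    let stage = TransactionLookupStage::new(5_000_000, 500 * 1024 * 1024, None);
    assert_eq!(stage.chunk_size, 5_000_000);
    assert_eq!(stage.etl_file_size, 500 * 1024 * 1024);
    assert!(stage.prune_mode.is_none());
}
```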