feat: create a NippyJar snapshot from multiple Table (#4716)

Author: joshieDo
Date: 2023-09-27 15:13:18 +01:00
Committed by: GitHub
Parent: ac6570fa7a
Commit: 0cdd30251b
19 changed files with 409 additions and 59 deletions

Cargo.lock (generated)

@@ -5527,6 +5527,7 @@ dependencies = [
  "reth-interfaces",
  "reth-libmdbx",
  "reth-metrics",
+ "reth-nippy-jar",
  "reth-primitives",
  "secp256k1",
  "serde",
@@ -5697,6 +5698,7 @@ dependencies = [
  "reth-db",
  "reth-eth-wire",
  "reth-network-api",
+ "reth-nippy-jar",
  "reth-primitives",
  "reth-rpc-types",
  "revm-primitives",
@@ -5971,9 +5973,11 @@ dependencies = [
  "itertools 0.11.0",
  "parking_lot 0.12.1",
  "pin-project",
+ "rand 0.8.5",
  "rayon",
  "reth-db",
  "reth-interfaces",
+ "reth-nippy-jar",
  "reth-primitives",
  "reth-revm-primitives",
  "reth-rlp",


@@ -6,7 +6,7 @@ use reth_consensus_common::validation::validate_block_standalone;
 use reth_db::{
     cursor::DbCursorRO,
     database::Database,
-    table::{Table, TableRow},
+    table::{Decode, Decompress, Table, TableRow},
     transaction::{DbTx, DbTxMut},
     DatabaseError, RawTable, TableRawRow,
 };
@@ -128,16 +128,22 @@ impl<'a, DB: Database> DbTool<'a, DB> {
         let map_filter = |row: Result<TableRawRow<T>, _>| {
             if let Ok((k, v)) = row {
+                let (key, value) = (k.into_key(), v.into_value());
                 let result = || {
                     if filter.only_count {
                         return None
                     }
-                    Some((k.key().unwrap(), v.value().unwrap()))
+                    Some((
+                        <T as Table>::Key::decode(&key).unwrap(),
+                        <T as Table>::Value::decompress(&value).unwrap(),
+                    ))
                 };
                 match &*bmb {
                     Some(searcher) => {
-                        if searcher.find_first_in(v.raw_value()).is_some() ||
-                            searcher.find_first_in(k.raw_key()).is_some()
+                        if searcher.find_first_in(&value).is_some() ||
+                            searcher.find_first_in(&key).is_some()
                         {
                             hits += 1;
                             return result()


@@ -9,6 +9,7 @@ repository.workspace = true
 [dependencies]
 reth-codecs = { path = "../storage/codecs" }
+reth-nippy-jar = { path = "../storage/nippy-jar" }
 reth-primitives.workspace = true
 reth-rpc-types.workspace = true
 reth-network-api.workspace = true


@@ -29,3 +29,9 @@ pub enum RethError {
     #[error("{0}")]
     Custom(String),
 }
+
+impl From<reth_nippy_jar::NippyJarError> for RethError {
+    fn from(err: reth_nippy_jar::NippyJarError) -> Self {
+        RethError::Custom(err.to_string())
+    }
+}

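This conversion is what lets the snapshot code further down use `?` on NippyJar calls inside functions returning `RethResult`. A minimal sketch, assuming a caller that already holds a configured jar (the `freeze_jar` helper is illustrative, not part of the diff):

    use reth_interfaces::RethResult;
    use reth_nippy_jar::{ColumnResult, NippyJar};

    // Illustrative only: a `NippyJarError` raised by `freeze` is converted into
    // `RethError::Custom` by the `From` impl above, so `?` just works here.
    fn freeze_jar(
        jar: &mut NippyJar<()>,
        columns: Vec<Vec<ColumnResult<Vec<u8>>>>,
        rows: u64,
    ) -> RethResult<()> {
        jar.freeze(columns, rows)?;
        Ok(())
    }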

@@ -14,6 +14,7 @@ reth-primitives.workspace = true
 reth-interfaces.workspace = true
 reth-codecs = { path = "../codecs" }
 reth-libmdbx = { path = "../libmdbx-rs", optional = true, features = ["return-borrowed"] }
+reth-nippy-jar = { path = "../nippy-jar" }

 # codecs
 serde = { workspace = true, default-features = false }
@@ -42,6 +43,7 @@ tempfile = { version = "3.3.0", optional = true }
 parking_lot.workspace = true
 derive_more = "0.99"
 eyre = "0.6.8"
+paste = "1.0"

 # arbitrary utils
 arbitrary = { workspace = true, features = ["derive"], optional = true }


@@ -35,6 +35,11 @@ pub trait Compress: Send + Sync + Sized + Debug {
 pub trait Decompress: Send + Sync + Sized + Debug {
     /// Decompresses data coming from the database.
     fn decompress<B: AsRef<[u8]>>(value: B) -> Result<Self, DatabaseError>;
+
+    /// Decompresses owned data coming from the database.
+    fn decompress_owned(value: Vec<u8>) -> Result<Self, DatabaseError> {
+        Self::decompress(value)
+    }
 }

 /// Trait that will transform the data to be saved in the DB.

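The owned variant exists so callers that already hold a `Vec<u8>` (for example an MDBX `Cow::Owned` value) can skip a copy. A minimal sketch of a type overriding it; the `RawBytes` type is hypothetical, for illustration only:

    use reth_db::{table::Decompress, DatabaseError};

    /// Hypothetical value type that stores its database bytes verbatim.
    #[derive(Debug)]
    struct RawBytes(Vec<u8>);

    impl Decompress for RawBytes {
        fn decompress<B: AsRef<[u8]>>(value: B) -> Result<Self, DatabaseError> {
            // Borrowed path: has to copy into an owned buffer.
            Ok(RawBytes(value.as_ref().to_vec()))
        }

        fn decompress_owned(value: Vec<u8>) -> Result<Self, DatabaseError> {
            // Owned path: reuse the allocation, no copy.
            Ok(RawBytes(value))
        }
    }

The `RawValue` change further down in this diff does exactly this for its inner buffer.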

@@ -68,6 +68,7 @@
 pub mod abstraction;
 mod implementation;
+pub mod snapshot;
 pub mod tables;
 mod utils;
 pub mod version;


@@ -0,0 +1,87 @@
//! reth's snapshot creation from database tables

use crate::{
    abstraction::cursor::DbCursorRO,
    table::{Key, Table},
    transaction::DbTx,
    RawKey, RawTable,
};
use reth_interfaces::RethResult;
use reth_nippy_jar::{ColumnResult, NippyJar, PHFKey};
use std::{error::Error as StdError, ops::RangeInclusive};

/// Macro that generates snapshot creation functions that take an arbitrary number of [`Table`] and
/// creates a [`NippyJar`] file out of their [`Table::Value`]. Each list of [`Table::Value`] from a
/// table is a column of values.
///
/// Has membership filter set and compression dictionary support.
macro_rules! generate_snapshot_func {
    ($(($($tbl:ident),+)),+ $(,)? ) => {
        $(
            paste::item! {
                /// Creates a snapshot from specified tables. Each table's `Value` iterator represents a column.
                ///
                /// **Ensure the range contains the same number of rows.**
                ///
                /// * `tx`: Database transaction.
                /// * `range`: Data range for columns in tables.
                /// * `keys`: Iterator of keys (eg. `TxHash` or `BlockHash`) with length equal to `row_count` and ordered by future column insertion from `range`.
                /// * `dict_compression_set`: Sets of column data for compression dictionaries. Max size is 2GB. Row count is independent.
                /// * `row_count`: Total rows to add to `NippyJar`. Must match row count in `range`.
                /// * `nippy_jar`: Snapshot object responsible for file generation.
                #[allow(non_snake_case)]
                pub fn [<create_snapshot$(_ $tbl)+>]<'tx,
                    $($tbl: Table<Key=K>,)+
                    K
                >
                (
                    tx: &impl DbTx<'tx>,
                    range: RangeInclusive<K>,
                    dict_compression_set: Option<Vec<impl Iterator<Item = Vec<u8>>>>,
                    keys: Option<impl Iterator<Item = ColumnResult<impl PHFKey>>>,
                    row_count: usize,
                    nippy_jar: &mut NippyJar
                ) -> RethResult<()>
                    where K: Key + Copy
                {
                    let range: RangeInclusive<RawKey<K>> = RawKey::new(*range.start())..=RawKey::new(*range.end());

                    // Create PHF and Filter if required
                    if let Some(keys) = keys {
                        nippy_jar.prepare_index(keys, row_count)?;
                    }

                    // Create compression dictionaries if required
                    if let Some(data_sets) = dict_compression_set {
                        nippy_jar.prepare_compression(data_sets)?;
                    }

                    // Creates the cursors for the columns
                    $(
                        let mut [< $tbl _cursor>] = tx.cursor_read::<RawTable<$tbl>>()?;
                        let [< $tbl _iter>] = [< $tbl _cursor>]
                            .walk_range(range.clone())?
                            .into_iter()
                            .map(|row|
                                row
                                    .map(|(_key, val)| val.into_value())
                                    .map_err(|e| Box::new(e) as Box<dyn StdError + Send + Sync>)
                            );
                    )+

                    // Create the snapshot from the data
                    let col_iterators: Vec<Box<dyn Iterator<Item = Result<Vec<u8>,_>>>> = vec![
                        $(Box::new([< $tbl _iter>]),)+
                    ];

                    nippy_jar.freeze(col_iterators, row_count as u64)?;

                    Ok(())
                }
            }
        )+
    };
}

generate_snapshot_func!((T1), (T1, T2), (T1, T2, T3), (T1, T2, T3, T4),);

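For orientation, a rough sketch of calling one of the generated functions; the helper, table choice, and turbofish `None`s are illustrative, and a full end-to-end version lives in the provider test at the bottom of this diff:

    use reth_db::{snapshot::create_snapshot_T1_T2, transaction::DbTx, HeaderTD, Headers};
    use reth_interfaces::RethResult;
    use reth_nippy_jar::{ColumnResult, NippyJar};
    use reth_primitives::BlockNumber;
    use std::path::Path;

    // Illustrative helper: snapshot the first `row_count` Headers + HeaderTD rows,
    // with no compression dictionaries and no filter/PHF key set.
    fn snapshot_headers<'tx>(tx: &impl DbTx<'tx>, path: &Path, row_count: usize) -> RethResult<()> {
        let mut jar = NippyJar::new_without_header(2, path);

        create_snapshot_T1_T2::<Headers, HeaderTD, BlockNumber>(
            tx,
            0..=(row_count as u64 - 1),
            // `None` still needs concrete iterator types to satisfy the `impl Trait` arguments.
            None::<Vec<std::vec::IntoIter<Vec<u8>>>>,
            None::<std::iter::Empty<ColumnResult<Vec<u8>>>>,
            row_count,
            &mut jar,
        )?;
        Ok(())
    }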

@@ -54,14 +54,21 @@ impl<K: Key> RawKey<K> {
     pub fn new(key: K) -> Self {
         Self { key: K::encode(key).as_ref().to_vec(), _phantom: std::marker::PhantomData }
     }

     /// Returns the decoded value.
     pub fn key(&self) -> Result<K, DatabaseError> {
         K::decode(&self.key)
     }

     /// Returns the raw key as seen on the database.
     pub fn raw_key(&self) -> &Vec<u8> {
         &self.key
     }
+
+    /// Consumes [`Self`] and returns the inner raw key.
+    pub fn into_key(self) -> Vec<u8> {
+        self.key
+    }
 }

 impl<K: Key> From<K> for RawKey<K> {
@@ -105,14 +112,21 @@ impl<V: Value> RawValue<V> {
     pub fn new(value: V) -> Self {
         Self { value: V::compress(value).as_ref().to_vec(), _phantom: std::marker::PhantomData }
     }

     /// Returns the decompressed value.
     pub fn value(&self) -> Result<V, DatabaseError> {
         V::decompress(&self.value)
     }

     /// Returns the raw value as seen on the database.
-    pub fn raw_value(&self) -> &Vec<u8> {
+    pub fn raw_value(&self) -> &[u8] {
         &self.value
     }
+
+    /// Consumes [`Self`] and returns the inner raw value.
+    pub fn into_value(self) -> Vec<u8> {
+        self.value
+    }
 }

 impl AsRef<[u8]> for RawValue<Vec<u8>> {
@@ -142,4 +156,8 @@ impl<V: Value> Decompress for RawValue<V> {
     fn decompress<B: AsRef<[u8]>>(value: B) -> Result<Self, DatabaseError> {
         Ok(Self { value: value.as_ref().to_vec(), _phantom: std::marker::PhantomData })
     }
+
+    fn decompress_owned(value: Vec<u8>) -> Result<Self, DatabaseError> {
+        Ok(Self { value, _phantom: std::marker::PhantomData })
+    }
 }

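These consuming accessors are what make the raw-byte fast path in the db tool change above possible: the searcher can scan undecoded bytes and only decode actual hits. A rough standalone sketch under the same idea; the `find_rows` helper and its substring check are illustrative:

    use reth_db::{
        cursor::DbCursorRO,
        table::{Decode, Decompress, Table},
        transaction::DbTx,
        DatabaseError, RawTable,
    };

    // Illustrative helper: walk a table in raw form and only pay decode/decompress
    // cost for rows whose raw value contains `needle`.
    fn find_rows<'tx, T: Table>(
        tx: &impl DbTx<'tx>,
        needle: &[u8],
    ) -> Result<Vec<(T::Key, T::Value)>, DatabaseError> {
        let mut cursor = tx.cursor_read::<RawTable<T>>()?;
        let mut hits = Vec::new();

        for row in cursor.walk(None)? {
            let (k, v) = row?;
            // `into_key`/`into_value` hand back the still-encoded bytes without decoding.
            let (key, value) = (k.into_key(), v.into_value());

            if !needle.is_empty() && value.windows(needle.len()).any(|w| w == needle) {
                hits.push((T::Key::decode(&key)?, T::Value::decompress(&value)?));
            }
        }
        Ok(hits)
    }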

@@ -54,7 +54,7 @@ where
     };
     let value = match kv.1 {
         Cow::Borrowed(v) => Decompress::decompress(v)?,
-        Cow::Owned(v) => Decompress::decompress(v)?,
+        Cow::Owned(v) => Decompress::decompress_owned(v)?,
     };
     Ok((key, value))
 }
@@ -68,7 +68,7 @@ where
 {
     Ok(match kv.1 {
         Cow::Borrowed(v) => Decompress::decompress(v)?,
-        Cow::Owned(v) => Decompress::decompress(v)?,
+        Cow::Owned(v) => Decompress::decompress_owned(v)?,
     })
 }
@@ -79,6 +79,6 @@ where
 {
     Ok(match value {
         Cow::Borrowed(v) => Decompress::decompress(v)?,
-        Cow::Owned(v) => Decompress::decompress(v)?,
+        Cow::Owned(v) => Decompress::decompress_owned(v)?,
     })
 }


@@ -9,7 +9,7 @@ use sucds::int_vectors::Access;
 use zstd::bulk::Decompressor;

 /// Simple cursor implementation to retrieve data from [`NippyJar`].
-pub struct NippyJarCursor<'a, H> {
+pub struct NippyJarCursor<'a, H = ()> {
     /// [`NippyJar`] which holds most of the required configuration to read from the file.
     jar: &'a NippyJar<H>,
     /// Optional dictionary decompressors.


@@ -3,6 +3,8 @@ use thiserror::Error;
 /// Errors associated with [`crate::NippyJar`].
 #[derive(Debug, Error)]
 pub enum NippyJarError {
+    #[error(transparent)]
+    Internal(#[from] Box<dyn std::error::Error + Send + Sync>),
     #[error(transparent)]
     Disconnect(#[from] std::io::Error),
     #[error(transparent)]


@@ -13,8 +13,8 @@
 use serde::{Deserialize, Serialize};
 use std::{
     clone::Clone,
+    error::Error as StdError,
     fs::File,
-    hash::Hash,
     io::{Seek, Write},
     marker::Sync,
     path::{Path, PathBuf},
@@ -32,6 +32,7 @@ pub mod compression;
 use compression::{Compression, Compressors};

 pub mod phf;
+pub use phf::PHFKey;
 use phf::{Fmph, Functions, GoFmph, PerfectHashingFunction};

 mod error;
@@ -45,6 +46,9 @@ const NIPPY_JAR_VERSION: usize = 1;
 /// A [`Row`] is a list of its selected column values.
 type Row = Vec<Vec<u8>>;

+/// Alias type for a column value wrapped in `Result`
+pub type ColumnResult<T> = Result<T, Box<dyn StdError + Send + Sync>>;
+
 /// `NippyJar` is a specialized storage format designed for immutable data.
 ///
 /// Data is organized into a columnar format, enabling column-based compression. Data retrieval
@@ -106,6 +110,11 @@ impl NippyJar<()> {
     pub fn load_without_header(path: &Path) -> Result<Self, NippyJarError> {
         NippyJar::<()>::load(path)
     }
+
+    /// Whether this [`NippyJar`] uses a [`InclusionFilters`] and [`Functions`].
+    pub fn uses_filters(&self) -> bool {
+        self.filter.is_some() && self.phf.is_some()
+    }
 }

 impl<H> NippyJar<H>
@@ -210,19 +219,23 @@ where
     /// Prepares beforehand the offsets index for querying rows based on `values` (eg. transaction
     /// hash). Expects `values` to be sorted in the same way as the data that is going to be
     /// later on inserted.
-    pub fn prepare_index<T: AsRef<[u8]> + Sync + Clone + Hash>(
+    ///
+    /// Currently collecting all items before acting on them.
+    pub fn prepare_index<T: PHFKey>(
         &mut self,
-        values: &[T],
+        values: impl IntoIterator<Item = ColumnResult<T>>,
+        row_count: usize,
     ) -> Result<(), NippyJarError> {
-        let mut offsets_index = vec![0; values.len()];
+        let values = values.into_iter().collect::<Result<Vec<_>, _>>()?;
+        let mut offsets_index = vec![0; row_count];

         // Builds perfect hashing function from the values
         if let Some(phf) = self.phf.as_mut() {
-            phf.set_keys(values)?;
+            phf.set_keys(&values)?;
         }

         if self.filter.is_some() || self.phf.is_some() {
-            for (row_num, v) in values.iter().enumerate() {
+            for (row_num, v) in values.into_iter().enumerate() {
                 if let Some(filter) = self.filter.as_mut() {
                     filter.add(v.as_ref())?;
                 }
@@ -242,7 +255,7 @@ where
     /// Writes all data and configuration to a file and the offset index to another.
     pub fn freeze(
         &mut self,
-        columns: Vec<impl IntoIterator<Item = Vec<u8>>>,
+        columns: Vec<impl IntoIterator<Item = ColumnResult<Vec<u8>>>>,
         total_rows: u64,
     ) -> Result<(), NippyJarError> {
         let mut file = self.freeze_check(&columns)?;
@@ -275,7 +288,7 @@ where
                 offsets.push(file.stream_position()? as usize);

                 match column_iter.next() {
-                    Some(value) => {
+                    Some(Ok(value)) => {
                         if let Some(compression) = &self.compressor {
                             // Special zstd case with dictionaries
                             if let (Some(dict_compressors), Compressors::Zstd(_)) =
@@ -300,6 +313,7 @@ where
                             column_number as u64,
                         ))
                     }
+                    Some(Err(err)) => return Err(err.into()),
                 }

                 iterators.push(column_iter);
@@ -339,7 +353,7 @@ where
     /// Safety checks before creating and returning a [`File`] handle to write data to.
     fn freeze_check(
         &mut self,
-        columns: &Vec<impl IntoIterator<Item = Vec<u8>>>,
+        columns: &Vec<impl IntoIterator<Item = ColumnResult<Vec<u8>>>>,
     ) -> Result<File, NippyJarError> {
         if columns.len() != self.columns {
             return Err(NippyJarError::ColumnLenMismatch(self.columns, columns.len()))
@@ -384,10 +398,7 @@ impl<H> PerfectHashingFunction for NippyJar<H>
 where
     H: Send + Sync + Serialize + for<'a> Deserialize<'a>,
 {
-    fn set_keys<T: AsRef<[u8]> + Sync + Clone + Hash>(
-        &mut self,
-        keys: &[T],
-    ) -> Result<(), NippyJarError> {
+    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
         self.phf.as_mut().ok_or(NippyJarError::PHFMissing)?.set_keys(keys)
     }
@@ -402,6 +413,7 @@ mod tests {
     use rand::{rngs::SmallRng, seq::SliceRandom, RngCore, SeedableRng};
     use std::collections::HashSet;

+    type ColumnResults<T> = Vec<ColumnResult<T>>;
     type ColumnValues = Vec<Vec<u8>>;

     fn test_data(seed: Option<u64>) -> (ColumnValues, ColumnValues) {
@@ -423,6 +435,10 @@ mod tests {
         (gen(), gen())
     }

+    fn clone_with_result(col: &ColumnValues) -> ColumnResults<Vec<u8>> {
+        col.iter().map(|v| Ok(v.clone())).collect()
+    }
+
     #[test]
     fn test_phf() {
         let (col1, col2) = test_data(None);
@@ -455,8 +471,10 @@ mod tests {
             assert_eq!(indexes, collect_indexes(nippy));

             // Ensure that loaded phf provides the same function outputs
-            nippy.prepare_index(&col1).unwrap();
-            nippy.freeze(vec![col1.clone(), col2.clone()], num_rows).unwrap();
+            nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
+            nippy
+                .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
+                .unwrap();
             let loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
             assert_eq!(indexes, collect_indexes(&loaded_nippy));
         };
@@ -504,7 +522,7 @@ mod tests {
             Err(NippyJarError::FilterMaxCapacity)
         ));

-        nippy.freeze(vec![col1.clone(), col2.clone()], num_rows).unwrap();
+        nippy.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows).unwrap();
        let loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
        assert_eq!(nippy, loaded_nippy);
@@ -540,16 +558,14 @@ mod tests {
            ));
        }

-        let data = vec![col1.clone(), col2.clone()];
-
        // If ZSTD is enabled, do not write to the file unless the column dictionaries have been
        // calculated.
        assert!(matches!(
-            nippy.freeze(data.clone(), num_rows),
+            nippy.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows),
            Err(NippyJarError::CompressorNotReady)
        ));

-        nippy.prepare_compression(data.clone()).unwrap();
+        nippy.prepare_compression(vec![col1.clone(), col2.clone()]).unwrap();

        if let Some(Compressors::Zstd(zstd)) = &nippy.compressor {
            assert!(matches!(
@@ -558,7 +574,7 @@ mod tests {
            ));
        }

-        nippy.freeze(data.clone(), num_rows).unwrap();
+        nippy.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows).unwrap();

        let mut loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
        assert_eq!(nippy, loaded_nippy);
@@ -578,7 +594,7 @@ mod tests {
            // Iterate over compressed values and compare
            let mut row_index = 0usize;
            while let Some(row) = cursor.next_row().unwrap() {
-                assert_eq!((&row[0], &row[1]), (&data[0][row_index], &data[1][row_index]));
+                assert_eq!((&row[0], &row[1]), (&col1[row_index], &col2[row_index]));
                row_index += 1;
            }
        }
@@ -598,9 +614,7 @@ mod tests {
            NippyJar::new_without_header(num_columns, file_path.path()).with_zstd(false, 5000);
        assert!(nippy.compressor.is_some());

-        let data = vec![col1.clone(), col2.clone()];
-
-        nippy.freeze(data.clone(), num_rows).unwrap();
+        nippy.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows).unwrap();

        let loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
        assert_eq!(nippy, loaded_nippy);
@@ -613,7 +627,7 @@ mod tests {
            // Iterate over compressed values and compare
            let mut row_index = 0usize;
            while let Some(row) = cursor.next_row().unwrap() {
-                assert_eq!((&row[0], &row[1]), (&data[0][row_index], &data[1][row_index]));
+                assert_eq!((&row[0], &row[1]), (&col1[row_index], &col2[row_index]));
                row_index += 1;
            }
        } else {
@@ -629,6 +643,7 @@ mod tests {
        let num_columns = 2;
        let file_path = tempfile::NamedTempFile::new().unwrap();
        let data = vec![col1.clone(), col2.clone()];
+        let block_start = 500;

        #[derive(Serialize, Deserialize, Debug)]
@@ -645,8 +660,10 @@ mod tests {
            .with_mphf();

        nippy.prepare_compression(data.clone()).unwrap();
-        nippy.prepare_index(&col1).unwrap();
-        nippy.freeze(data.clone(), num_rows).unwrap();
+        nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
+        nippy
+            .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
+            .unwrap();
    }

    // Read file
@@ -710,8 +727,10 @@ mod tests {
            .with_mphf();

        nippy.prepare_compression(data.clone()).unwrap();
-        nippy.prepare_index(&col1).unwrap();
-        nippy.freeze(data.clone(), num_rows).unwrap();
+        nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
+        nippy
+            .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
+            .unwrap();
    }

    // Read file


@@ -1,10 +1,9 @@
-use crate::{NippyJarError, PerfectHashingFunction};
+use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
 use ph::fmph::{BuildConf, Function};
 use serde::{
     de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
     Serializer,
 };
-use std::{clone::Clone, hash::Hash, marker::Sync};

 /// Wrapper struct for [`Function`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
 #[derive(Default)]
@@ -19,10 +18,7 @@ impl Fmph {
 }

 impl PerfectHashingFunction for Fmph {
-    fn set_keys<T: AsRef<[u8]> + Sync + Clone + Hash>(
-        &mut self,
-        keys: &[T],
-    ) -> Result<(), NippyJarError> {
+    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
         self.function = Some(Function::from_slice_with_conf(
             keys,
             BuildConf { use_multiple_threads: true, ..Default::default() },


@@ -1,10 +1,9 @@
-use crate::{NippyJarError, PerfectHashingFunction};
+use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
 use ph::fmph::{GOBuildConf, GOFunction};
 use serde::{
     de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
     Serializer,
 };
-use std::{clone::Clone, hash::Hash, marker::Sync};

 /// Wrapper struct for [`GOFunction`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
 #[derive(Default)]
@@ -19,10 +18,7 @@ impl GoFmph {
 }

 impl PerfectHashingFunction for GoFmph {
-    fn set_keys<T: AsRef<[u8]> + Sync + Clone + Hash>(
-        &mut self,
-        keys: &[T],
-    ) -> Result<(), NippyJarError> {
+    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
         self.function = Some(GOFunction::from_slice_with_conf(
             keys,
             GOBuildConf { use_multiple_threads: true, ..Default::default() },


@@ -8,13 +8,14 @@ pub use fmph::Fmph;
 mod go_fmph;
 pub use go_fmph::GoFmph;

+/// Trait alias for [`PerfectHashingFunction`] keys.
+pub trait PHFKey: AsRef<[u8]> + Sync + Clone + Hash {}
+impl<T: AsRef<[u8]> + Sync + Clone + Hash> PHFKey for T {}
+
 /// Trait to build and query a perfect hashing function.
 pub trait PerfectHashingFunction: Serialize + for<'a> Deserialize<'a> {
     /// Adds the key set and builds the perfect hashing function.
-    fn set_keys<T: AsRef<[u8]> + Sync + Clone + Hash>(
-        &mut self,
-        keys: &[T],
-    ) -> Result<(), NippyJarError>;
+    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError>;

     /// Get corresponding associated integer. There might be false positives.
     fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError>;
@@ -29,10 +30,7 @@ pub enum Functions {
 }

 impl PerfectHashingFunction for Functions {
-    fn set_keys<T: AsRef<[u8]> + Sync + Clone + Hash>(
-        &mut self,
-        keys: &[T],
-    ) -> Result<(), NippyJarError> {
+    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
         match self {
             Functions::Fmph(f) => f.set_keys(keys),
             Functions::GoFmph(f) => f.set_keys(keys),

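Because of the blanket impl above, any `AsRef<[u8]> + Sync + Clone + Hash` type (plain byte vectors, hash types) can be fed straight into `prepare_index`. A small sketch, where the `build_index` helper is illustrative:

    use reth_nippy_jar::{ColumnResult, NippyJar, NippyJarError};

    // Illustrative helper: `Vec<u8>` is `PHFKey` via the blanket impl, so the hashes can be
    // passed as fallible items (`ColumnResult`) straight into `prepare_index`.
    fn build_index(jar: &mut NippyJar<()>, hashes: Vec<Vec<u8>>) -> Result<(), NippyJarError> {
        let row_count = hashes.len();
        let keys = hashes.into_iter().map(|h| -> ColumnResult<Vec<u8>> { Ok(h) });
        jar.prepare_index(keys, row_count)
    }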

@@ -15,6 +15,7 @@ reth-interfaces.workspace = true
 reth-revm-primitives = { path = "../../revm/revm-primitives" }
 reth-db.workspace = true
 reth-trie = { path = "../../trie" }
+reth-nippy-jar = { path = "../nippy-jar" }

 # async
 tokio = { workspace = true, features = ["sync", "macros", "rt-multi-thread"] }
@@ -45,6 +46,7 @@ reth-interfaces = { workspace = true, features = ["test-utils"] }
 parking_lot.workspace = true
 tempfile = "3.3"
 assert_matches.workspace = true
+rand.workspace = true

 [features]
 test-utils = ["reth-rlp"]


@@ -35,6 +35,8 @@ use tracing::trace;
 mod bundle_state_provider;
 mod chain_info;
 mod database;
+mod snapshot;
+pub use snapshot::SnapshotProvider;
 mod state;
 use crate::{providers::chain_info::ChainInfoTracker, traits::BlockSource};
 pub use bundle_state_provider::BundleStateProvider;


@@ -0,0 +1,205 @@
use crate::HeaderProvider;
use reth_db::{
    table::{Decompress, Table},
    HeaderTD,
};
use reth_interfaces::RethResult;
use reth_nippy_jar::{NippyJar, NippyJarCursor};
use reth_primitives::{BlockHash, BlockNumber, Header, SealedHeader, U256};
use std::ops::RangeBounds;

/// SnapshotProvider
///
/// WIP Rudimentary impl just for tests
/// TODO: should be able to walk through snapshot files/block_ranges
/// TODO: Arc over NippyJars and/or NippyJarCursors (LRU)
#[derive(Debug)]
pub struct SnapshotProvider<'a> {
    /// NippyJar
    pub jar: &'a NippyJar,
}

impl<'a> SnapshotProvider<'a> {
    fn cursor(&self) -> NippyJarCursor<'a> {
        NippyJarCursor::new(self.jar, None).unwrap()
    }
}

impl<'a> HeaderProvider for SnapshotProvider<'a> {
    fn header(&self, block_hash: &BlockHash) -> RethResult<Option<Header>> {
        // WIP
        let mut cursor = self.cursor();

        let header = Header::decompress(
            &cursor.row_by_key_with_cols::<0b01, 2>(&block_hash.0).unwrap().unwrap()[0],
        )
        .unwrap();

        if &header.hash_slow() == block_hash {
            return Ok(Some(header))
        } else {
            // check next snapshot
        }
        Ok(None)
    }

    fn header_by_number(&self, _num: BlockNumber) -> RethResult<Option<Header>> {
        unimplemented!();
    }

    fn header_td(&self, block_hash: &BlockHash) -> RethResult<Option<U256>> {
        // WIP
        let mut cursor = self.cursor();

        let row = cursor.row_by_key_with_cols::<0b11, 2>(&block_hash.0).unwrap().unwrap();

        let header = Header::decompress(&row[0]).unwrap();
        let td = <HeaderTD as Table>::Value::decompress(&row[1]).unwrap();

        if &header.hash_slow() == block_hash {
            return Ok(Some(td.0))
        } else {
            // check next snapshot
        }

        Ok(None)
    }

    fn header_td_by_number(&self, _number: BlockNumber) -> RethResult<Option<U256>> {
        unimplemented!();
    }

    fn headers_range(&self, _range: impl RangeBounds<BlockNumber>) -> RethResult<Vec<Header>> {
        unimplemented!();
    }

    fn sealed_headers_range(
        &self,
        _range: impl RangeBounds<BlockNumber>,
    ) -> RethResult<Vec<SealedHeader>> {
        unimplemented!();
    }

    fn sealed_header(&self, _number: BlockNumber) -> RethResult<Option<SealedHeader>> {
        unimplemented!();
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::ProviderFactory;
    use rand::{self, seq::SliceRandom};
    use reth_db::{
        cursor::DbCursorRO,
        database::Database,
        snapshot::create_snapshot_T1_T2,
        test_utils::create_test_rw_db,
        transaction::{DbTx, DbTxMut},
        CanonicalHeaders, DatabaseError, HeaderNumbers, HeaderTD, Headers, RawTable,
    };
    use reth_interfaces::test_utils::generators::{self, random_header_range};
    use reth_nippy_jar::NippyJar;
    use reth_primitives::{H256, MAINNET};

    #[test]
    fn test_snap() {
        // Ranges
        let row_count = 100u64;
        let range = 0..=(row_count - 1);

        // Data sources
        let db = create_test_rw_db();
        let factory = ProviderFactory::new(&db, MAINNET.clone());
        let snap_file = tempfile::NamedTempFile::new().unwrap();

        // Setup data
        let mut headers = random_header_range(
            &mut generators::rng(),
            *range.start()..(*range.end() + 1),
            H256::random(),
        );

        db.update(|tx| -> std::result::Result<(), DatabaseError> {
            let mut td = U256::ZERO;
            for header in headers.clone() {
                td += header.header.difficulty;
                let hash = header.hash();

                tx.put::<CanonicalHeaders>(header.number, hash)?;
                tx.put::<Headers>(header.number, header.clone().unseal())?;
                tx.put::<HeaderTD>(header.number, td.into())?;
                tx.put::<HeaderNumbers>(hash, header.number)?;
            }
            Ok(())
        })
        .unwrap()
        .unwrap();

        // Create Snapshot
        {
            let with_compression = true;
            let with_filter = true;

            let mut nippy_jar = NippyJar::new_without_header(2, snap_file.path());

            if with_compression {
                nippy_jar = nippy_jar.with_zstd(false, 0);
            }

            if with_filter {
                nippy_jar = nippy_jar.with_cuckoo_filter(row_count as usize + 10).with_mphf();
            }

            let tx = db.tx().unwrap();

            // Hacky type inference. TODO fix
            let mut none_vec = Some(vec![vec![vec![0u8]].into_iter()]);
            let _ = none_vec.take();

            // Generate list of hashes for filters & PHF
            let mut cursor = tx.cursor_read::<RawTable<CanonicalHeaders>>().unwrap();
            let hashes = cursor
                .walk(None)
                .unwrap()
                .map(|row| row.map(|(_key, value)| value.into_value()).map_err(|e| e.into()));

            create_snapshot_T1_T2::<Headers, HeaderTD, BlockNumber>(
                &tx,
                range,
                none_vec,
                Some(hashes),
                row_count as usize,
                &mut nippy_jar,
            )
            .unwrap();
        }

        // Use providers to query Header data and compare if it matches
        {
            let jar = NippyJar::load_without_header(snap_file.path()).unwrap();
            let db_provider = factory.provider().unwrap();
            let snap_provider = SnapshotProvider { jar: &jar };

            assert!(!headers.is_empty());

            // Shuffled for chaos.
            headers.shuffle(&mut generators::rng());

            for header in headers {
                let header_hash = header.hash();
                let header = header.unseal();

                // Compare Header
                assert_eq!(header, db_provider.header(&header_hash).unwrap().unwrap());
                assert_eq!(header, snap_provider.header(&header_hash).unwrap().unwrap());

                // Compare HeaderTD
                assert_eq!(
                    db_provider.header_td(&header_hash).unwrap().unwrap(),
                    snap_provider.header_td(&header_hash).unwrap().unwrap()
                );
            }
        }
    }
}