Mirror of https://github.com/hl-archive-node/nanoreth.git, synced 2025-12-06 10:59:55 +00:00
chore: remove phf from static files (#10259)
Co-authored-by: joshieDo <93316087+joshieDo@users.noreply.github.com>
Co-authored-by: Matthias Seitz <matthias.seitz@outlook.de>

This change removes the perfect-hashing-function (PHF) machinery from the static-file format: the `ph` dependency, the `phf` module with the `Fmph`/`GoFmph` wrappers, the key-based cursor queries (`row_by_key`, `row_by_key_with_cols`) and `prepare_index`. An uninhabited `Functions` enum keeps the serialized `phf: Option<Functions>` field decodable for backwards compatibility.
@@ -19,7 +19,6 @@ name = "reth_nippy_jar"
 reth-fs-util.workspace = true
 
 # filter
-ph = "0.8.0"
 cuckoofilter = { version = "0.5.0", features = [
     "serde_support",
     "serde_bytes",
@@ -1,10 +1,8 @@
 use crate::{
     compression::{Compression, Compressors, Zstd},
-    DataReader, InclusionFilter, NippyJar, NippyJarError, NippyJarHeader, PerfectHashingFunction,
-    RefRow,
+    DataReader, NippyJar, NippyJarError, NippyJarHeader, RefRow,
 };
 use std::{ops::Range, sync::Arc};
-use sucds::int_vectors::Access;
 use zstd::bulk::Decompressor;
 
 /// Simple cursor implementation to retrieve data from [`NippyJar`].
@@ -67,35 +65,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
         self.row = 0;
     }
 
-    /// Returns a row, searching it by a key.
-    ///
-    /// **May return false positives.**
-    ///
-    /// Example usage would be querying a transactions file with a transaction hash which is **NOT**
-    /// stored in file.
-    pub fn row_by_key(&mut self, key: &[u8]) -> Result<Option<RefRow<'_>>, NippyJarError> {
-        if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
-            // TODO: is it worth to parallelize both?
-
-            // May have false positives
-            if filter.contains(key)? {
-                // May have false positives
-                if let Some(row_index) = phf.get_index(key)? {
-                    self.row = self
-                        .jar
-                        .offsets_index
-                        .access(row_index as usize)
-                        .expect("built from same set") as u64;
-                    return self.next_row()
-                }
-            }
-        } else {
-            return Err(NippyJarError::UnsupportedFilterQuery)
-        }
-
-        Ok(None)
-    }
-
     /// Returns a row by its number.
     pub fn row_by_number(&mut self, row: usize) -> Result<Option<RefRow<'_>>, NippyJarError> {
         self.row = row as u64;
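
The removed `row_by_key` was the by-hash entry point, and its docs warn that both the filter and the PHF can report false positives. A minimal sketch of the verification callers were expected to perform on a hit; the import path and the first-column-stores-the-key layout are assumptions for illustration, not confirmed crate API:

```rust
use reth_nippy_jar::{NippyJarCursor, NippyJarError, NippyJarHeader};

/// Sketch only: assumes the jar's first column stores the lookup key itself
/// (as a transactions file keyed by tx hash would), so a hit can be confirmed.
fn lookup_verified<H: NippyJarHeader>(
    cursor: &mut NippyJarCursor<'_, H>,
    key: &[u8],
) -> Result<Option<Vec<Vec<u8>>>, NippyJarError> {
    Ok(cursor
        .row_by_key(key)?
        // `row_by_key` may alias an unknown key onto some other row
        // (filter + PHF false positives), so compare the stored key.
        .filter(|row| row[0] == key)
        // Copy the borrowed columns out so the cursor can be reused.
        .map(|row| row.iter().map(|col| col.to_vec()).collect()))
}
```
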
@@ -130,40 +99,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
         ))
     }
 
-    /// Returns a row, searching it by a key using a
-    /// `mask` to only read certain columns from the row.
-    ///
-    /// **May return false positives.**
-    ///
-    /// Example usage would be querying a transactions file with a transaction hash which is **NOT**
-    /// stored in file.
-    pub fn row_by_key_with_cols(
-        &mut self,
-        key: &[u8],
-        mask: usize,
-    ) -> Result<Option<RefRow<'_>>, NippyJarError> {
-        if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
-            // TODO: is it worth to parallelize both?
-
-            // May have false positives
-            if filter.contains(key)? {
-                // May have false positives
-                if let Some(row_index) = phf.get_index(key)? {
-                    self.row = self
-                        .jar
-                        .offsets_index
-                        .access(row_index as usize)
-                        .expect("built from same set") as u64;
-                    return self.next_row_with_cols(mask)
-                }
-            }
-        } else {
-            return Err(NippyJarError::UnsupportedFilterQuery)
-        }
-
-        Ok(None)
-    }
-
     /// Returns a row by its number by using a `mask` to only read certain columns from the row.
     pub fn row_by_number_with_cols(
         &mut self,
@@ -31,10 +31,6 @@ pub enum NippyJarError {
     FilterMaxCapacity,
     #[error("cuckoo was not properly initialized after loaded")]
     FilterCuckooNotLoaded,
-    #[error("perfect hashing function doesn't have any keys added")]
-    PHFMissingKeys,
-    #[error("nippy jar initialized without perfect hashing function")]
-    PHFMissing,
     #[error("nippy jar was built without an index")]
     UnsupportedFilterQuery,
     #[error("the size of an offset must be at most 8 bytes, got {offset_size}")]
@@ -32,9 +32,10 @@ pub mod compression;
 use compression::Compression;
 use compression::Compressors;
 
-pub mod phf;
-pub use phf::PHFKey;
-use phf::{Fmph, Functions, GoFmph, PerfectHashingFunction};
+/// empty enum for backwards compatibility
+#[derive(Debug, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq, Eq))]
+pub enum Functions {}
 
 mod error;
 pub use error::NippyJarError;
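
Replacing the old enum with an uninhabited one preserves on-disk compatibility: `Option<Functions>` keeps a decodable layout, and existing configs, which only ever stored `None` here, still deserialize while the variants become unconstructible. A minimal sketch of the idea with serde and bincode; the names are illustrative, not taken from the codebase:

```rust
use serde::{Deserialize, Serialize};

/// Stand-in for the retired PHF variants: it has no values, so it can never
/// be constructed, but `Option<Legacy>` still (de)serializes.
#[derive(Debug, Serialize, Deserialize)]
enum Legacy {}

#[derive(Debug, Serialize, Deserialize)]
struct Config {
    phf: Option<Legacy>,
}

fn main() {
    // Old files only ever contained `None` for this field, so decoding
    // round-trips fine even though `Legacy` is uninhabited.
    let bytes = bincode::serialize(&Config { phf: None }).unwrap();
    let decoded: Config = bincode::deserialize(&bytes).unwrap();
    assert!(decoded.phf.is_none());
}
```
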
@@ -74,24 +75,6 @@ impl<T> NippyJarHeader for T where
 ///
 /// Data is organized into a columnar format, enabling column-based compression. Data retrieval
 /// entails consulting an offset list and fetching the data from file via `mmap`.
-///
-/// PHF & Filters:
-/// For data membership verification, the `filter` field can be configured with algorithms like
-/// Bloom or Cuckoo filters. While these filters enable rapid membership checks, it's important to
-/// note that **they may yield false positives but not false negatives**. Therefore, they serve as
-/// preliminary checks (eg. in `by_hash` queries) and should be followed by data verification on
-/// retrieval.
-///
-/// The `phf` (Perfect Hashing Function) and `offsets_index` fields facilitate the data retrieval
-/// process in for example `by_hash` queries. Specifically, the PHF converts a query, such as a
-/// block hash, into a unique integer. This integer is then used as an index in `offsets_index`,
-/// which maps to the actual data location in the `offsets` list. Similar to the `filter`, the PHF
-/// may also produce false positives but not false negatives, necessitating subsequent data
-/// verification.
-///
-/// Note: that the key (eg. `BlockHash`) passed to a filter and phf does not need to actually be
-/// stored.
-///
 /// Ultimately, the `freeze` function yields two files: a data file containing both the data and its
 /// configuration, and an index file that houses the offsets and `offsets_index`.
 #[derive(Serialize, Deserialize)]
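
The removed paragraphs describe a two-stage lookup: a probabilistic filter pre-check, then a PHF mapping the key to a slot in `offsets_index`. A toy model of that flow, with plain containers standing in for the real cuckoo filter, fmph PHF and Elias-Fano index; nothing here is the crate's API:

```rust
use std::collections::HashSet;

/// Toy model of the removed design: filter pre-check, then
/// PHF -> dense integer -> `offsets_index` -> row number.
struct ToyJar {
    filter: HashSet<Vec<u8>>, // stand-in for the cuckoo filter
    phf: fn(&[u8]) -> usize,  // stand-in for fmph: key -> dense integer
    offsets_index: Vec<u64>,  // PHF output -> row number
}

impl ToyJar {
    fn row_number(&self, key: &[u8]) -> Option<u64> {
        // Cheap membership pre-check; the real filter may false-positive.
        if !self.filter.contains(key) {
            return None;
        }
        // A foreign key that slips past the filter can alias another row,
        // which is why callers must verify the fetched data afterwards.
        Some(self.offsets_index[(self.phf)(key)])
    }
}

fn toy_phf(key: &[u8]) -> usize {
    key[0] as usize % 2 // NOT a real perfect hash; illustration only
}

fn main() {
    let mut filter = HashSet::new();
    filter.insert(b"a".to_vec());
    let jar = ToyJar { filter, phf: toy_phf, offsets_index: vec![42, 7] };
    assert_eq!(jar.row_number(b"a"), Some(7)); // b'a' = 97 -> slot 1 -> row 7
    assert_eq!(jar.row_number(b"b"), None);    // rejected by the filter
}
```
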
@@ -112,7 +95,7 @@ pub struct NippyJar<H = ()> {
     /// Optional filter function for data membership checks.
     filter: Option<InclusionFilters>,
     #[serde(skip)]
-    /// Optional Perfect Hashing Function (PHF) for unique offset mapping.
+    /// Optional field for backwards compatibility
     phf: Option<Functions>,
     /// Index mapping PHF output to value offsets in `offsets`.
     #[serde(skip)]
@@ -196,18 +179,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
         self
     }
 
-    /// Adds [`phf::Fmph`] perfect hashing function.
-    pub fn with_fmph(mut self) -> Self {
-        self.phf = Some(Functions::Fmph(Fmph::new()));
-        self
-    }
-
-    /// Adds [`phf::GoFmph`] perfect hashing function.
-    pub fn with_gofmph(mut self) -> Self {
-        self.phf = Some(Functions::GoFmph(GoFmph::new()));
-        self
-    }
-
     /// Gets a reference to the user header.
     pub const fn user_header(&self) -> &H {
         &self.user_header
@@ -346,16 +317,6 @@ impl<H: NippyJarHeader> InclusionFilter for NippyJar<H> {
     }
 }
 
-impl<H: NippyJarHeader> PerfectHashingFunction for NippyJar<H> {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.phf.as_mut().ok_or(NippyJarError::PHFMissing)?.set_keys(keys)
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        self.phf.as_ref().ok_or(NippyJarError::PHFMissing)?.get_index(key)
-    }
-}
-
 #[cfg(test)]
 impl<H: NippyJarHeader> NippyJar<H> {
     /// If required, prepares any compression algorithm to an early pass of the data.
@@ -371,55 +332,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
         Ok(())
     }
 
-    /// Prepares beforehand the offsets index for querying rows based on `values` (eg. transaction
-    /// hash). Expects `values` to be sorted in the same way as the data that is going to be
-    /// later on inserted.
-    ///
-    /// Currently collecting all items before acting on them.
-    pub fn prepare_index<T: PHFKey>(
-        &mut self,
-        values: impl IntoIterator<Item = ColumnResult<T>>,
-        row_count: usize,
-    ) -> Result<(), NippyJarError> {
-        debug!(target: "nippy-jar", ?row_count, "Preparing index.");
-
-        let values = values.into_iter().collect::<Result<Vec<_>, _>>()?;
-
-        debug_assert!(
-            row_count == values.len(),
-            "Row count ({row_count}) differs from value list count ({}).",
-            values.len()
-        );
-
-        let mut offsets_index = vec![0; row_count];
-
-        // Builds perfect hashing function from the values
-        if let Some(phf) = self.phf.as_mut() {
-            debug!(target: "nippy-jar", ?row_count, values_count = ?values.len(), "Setting keys for perfect hashing function.");
-            phf.set_keys(&values)?;
-        }
-
-        if self.filter.is_some() || self.phf.is_some() {
-            debug!(target: "nippy-jar", ?row_count, "Creating filter and offsets_index.");
-
-            for (row_num, v) in values.into_iter().enumerate() {
-                if let Some(filter) = self.filter.as_mut() {
-                    filter.add(v.as_ref())?;
-                }
-
-                if let Some(phf) = self.phf.as_mut() {
-                    // Points to the first column value offset of the row.
-                    let index = phf.get_index(v.as_ref())?.expect("initialized") as usize;
-                    let _ = std::mem::replace(&mut offsets_index[index], row_num as u64);
-                }
-            }
-        }
-
-        debug!(target: "nippy-jar", ?row_count, "Encoding offsets index list.");
-        self.offsets_index = PrefixSummedEliasFano::from_slice(&offsets_index)?;
-        Ok(())
-    }
-
     /// Writes all data and configuration to a file and the offset index to another.
     pub fn freeze(
         self,
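
The core of the removed method is the inversion loop: each build key's PHF output becomes a slot in `offsets_index`, and the row number is stored there. A self-contained sketch of that step, with a plain `Vec` and a toy closure standing in for the PHF and `PrefixSummedEliasFano`:

```rust
/// Sketch of the inversion the removed `prepare_index` performed.
/// `phf` must map every build key to a distinct slot in 0..keys.len().
fn build_offsets_index(keys: &[&[u8]], phf: impl Fn(&[u8]) -> usize) -> Vec<u64> {
    let mut offsets_index = vec![0u64; keys.len()];
    for (row_num, &key) in keys.iter().enumerate() {
        // The PHF's dense output becomes the slot, the row number the value,
        // so queries can later map key -> PHF index -> row.
        offsets_index[phf(key)] = row_num as u64;
    }
    offsets_index
}

fn main() {
    // Toy injective map over exactly these three keys (a real fmph
    // guarantees this property for the whole build set).
    let phf = |k: &[u8]| (k[0] - b'x') as usize;
    let keys: Vec<&[u8]> = vec![b"z", b"x", b"y"];
    // Row 0 holds "z" (slot 2), row 1 "x" (slot 0), row 2 "y" (slot 1).
    assert_eq!(build_offsets_index(&keys, phf), vec![1, 2, 0]);
}
```
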
@@ -447,7 +359,7 @@ impl<H: NippyJarHeader> NippyJar<H> {
         Ok(writer.into_jar())
     }
 
-    /// Freezes [`PerfectHashingFunction`], [`InclusionFilter`] and the offset index to file.
+    /// Freezes [`InclusionFilter`] and the offset index to file.
     fn freeze_filters(&self) -> Result<(), NippyJarError> {
         debug!(target: "nippy-jar", path=?self.index_path(), "Writing offsets and offsets index to file.");
 
@@ -474,11 +386,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
             }
         }
 
-        // Check `prepare_index` was called.
-        if let Some(phf) = &self.phf {
-            let _ = phf.get_index(&[])?;
-        }
-
         Ok(())
     }
 }
@@ -588,7 +495,7 @@ mod tests {
     use super::*;
     use compression::Compression;
     use rand::{rngs::SmallRng, seq::SliceRandom, RngCore, SeedableRng};
-    use std::{collections::HashSet, fs::OpenOptions};
+    use std::{fs::OpenOptions, io::Read};
 
     type ColumnResults<T> = Vec<ColumnResult<T>>;
     type ColumnValues = Vec<Vec<u8>>;
@@ -617,57 +524,30 @@ mod tests {
     }
 
     #[test]
-    fn test_phf() {
-        let (col1, col2) = test_data(None);
-        let num_columns = 2;
-        let num_rows = col1.len() as u64;
-        let file_path = tempfile::NamedTempFile::new().unwrap();
-
-        let create_nippy = || -> NippyJar<()> {
-            let mut nippy = NippyJar::new_without_header(num_columns, file_path.path());
-            assert!(matches!(
-                NippyJar::set_keys(&mut nippy, &col1),
-                Err(NippyJarError::PHFMissing)
-            ));
-            nippy
-        };
-
-        let check_phf = |mut nippy: NippyJar<_>| {
-            assert!(matches!(
-                NippyJar::get_index(&nippy, &col1[0]),
-                Err(NippyJarError::PHFMissingKeys)
-            ));
-            assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
-
-            let collect_indexes = |nippy: &NippyJar<_>| -> Vec<u64> {
-                col1.iter()
-                    .map(|value| NippyJar::get_index(nippy, value.as_slice()).unwrap().unwrap())
-                    .collect()
-            };
-
-            // Ensure all indexes are unique
-            let indexes = collect_indexes(&nippy);
-            assert_eq!(indexes.iter().collect::<HashSet<_>>().len(), indexes.len());
-
-            // Ensure reproducibility
-            assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
-            assert_eq!(indexes, collect_indexes(&nippy));
-
-            // Ensure that loaded phf provides the same function outputs
-            nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
-            nippy
-                .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
-                .unwrap();
-            let mut loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
-            loaded_nippy.load_filters().unwrap();
-            assert_eq!(indexes, collect_indexes(&loaded_nippy));
-        };
-
-        // fmph bytes size for 100 values of 32 bytes: 54
-        check_phf(create_nippy().with_fmph());
-
-        // fmph bytes size for 100 values of 32 bytes: 46
-        check_phf(create_nippy().with_gofmph());
+    fn test_config_serialization() {
+        let file = tempfile::NamedTempFile::new().unwrap();
+        let jar = NippyJar::new_without_header(23, file.path()).with_lz4();
+        jar.freeze_config().unwrap();
+
+        let mut config_file = OpenOptions::new().read(true).open(jar.config_path()).unwrap();
+        let config_file_len = config_file.metadata().unwrap().len();
+        assert_eq!(config_file_len, 37);
+
+        let mut buf = Vec::with_capacity(config_file_len as usize);
+        config_file.read_to_end(&mut buf).unwrap();
+
+        assert_eq!(
+            vec![
+                1, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            ],
+            buf
+        );
+
+        let mut read_jar = bincode::deserialize_from::<_, NippyJar>(&buf[..]).unwrap();
+        // Path is not ser/de
+        read_jar.path = file.path().to_path_buf();
+        assert_eq!(jar, read_jar);
     }
 
     #[test]
@@ -891,11 +771,9 @@ mod tests {
         let mut nippy =
             NippyJar::new(num_columns, file_path.path(), BlockJarHeader { block_start })
                 .with_zstd(true, 5000)
-                .with_cuckoo_filter(col1.len())
-                .with_fmph();
+                .with_cuckoo_filter(col1.len());
 
         nippy.prepare_compression(data.clone()).unwrap();
-        nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
         nippy
             .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
             .unwrap();
@@ -908,7 +786,6 @@ mod tests {
 
         assert!(loaded_nippy.compressor().is_some());
         assert!(loaded_nippy.filter.is_some());
-        assert!(loaded_nippy.phf.is_some());
         assert_eq!(loaded_nippy.user_header().block_start, block_start);
 
         if let Some(Compressors::Zstd(_zstd)) = loaded_nippy.compressor() {
@@ -929,22 +806,9 @@ mod tests {
         data.shuffle(&mut rand::thread_rng());
 
         for (row_num, (v0, v1)) in data {
-            // Simulates `by_hash` queries by iterating col1 values, which were used to
-            // create the inner index.
-            {
-                let row_by_value = cursor
-                    .row_by_key(v0)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!((&row_by_value[0], &row_by_value[1]), (v0, v1));
-
-                // Simulates `by_number` queries
-                let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
-                assert_eq!(row_by_value, row_by_num);
-            }
+            // Simulates `by_number` queries
+            let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
+            assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (v0, v1));
         }
     }
 }
@@ -962,11 +826,9 @@ mod tests {
         {
             let mut nippy = NippyJar::new_without_header(num_columns, file_path.path())
                 .with_zstd(true, 5000)
-                .with_cuckoo_filter(col1.len())
-                .with_fmph();
+                .with_cuckoo_filter(col1.len());
 
             nippy.prepare_compression(data).unwrap();
-            nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
             nippy
                 .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
                 .unwrap();
@@ -989,84 +851,41 @@ mod tests {
 
             // Read both columns
             for (row_num, (v0, v1)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_FULL_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!((&row_by_value[0], &row_by_value[1]), (*v0, *v1));
-
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_FULL_MASK)
                     .unwrap()
                     .unwrap();
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (*v0, *v1));
             }
 
             // Read first column only: `Block`
             const BLOCKS_BLOCK_MASK: usize = 0b01;
             for (row_num, (v0, _)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_BLOCK_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!(row_by_value.len(), 1);
-                assert_eq!(&row_by_value[0], *v0);
-
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_BLOCK_MASK)
                     .unwrap()
                     .unwrap();
                 assert_eq!(row_by_num.len(), 1);
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!(&row_by_num[0].to_vec(), *v0);
             }
 
             // Read second column only: `Block`
             const BLOCKS_WITHDRAWAL_MASK: usize = 0b10;
-            for (row_num, (v0, v1)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_WITHDRAWAL_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!(row_by_value.len(), 1);
-                assert_eq!(&row_by_value[0], *v1);
-
+            for (row_num, (_, v1)) in &data {
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_WITHDRAWAL_MASK)
                     .unwrap()
                     .unwrap();
                 assert_eq!(row_by_num.len(), 1);
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!(&row_by_num[0].to_vec(), *v1);
             }
 
             // Read nothing
             const BLOCKS_EMPTY_MASK: usize = 0b00;
-            for (row_num, (v0, _)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                assert!(cursor
-                    .row_by_key_with_cols(v0, BLOCKS_EMPTY_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .is_empty());
-
+            for (row_num, _) in &data {
                 // Simulates `by_number` queries
                 assert!(cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_EMPTY_MASK)
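
The column masks used in this test are plain bit flags: bit i selects column i of the row, so `0b01` keeps only the first column and `0b00` yields an empty row. A toy model of that selection (hypothetical helper, not the crate's implementation):

```rust
/// Keep only the columns whose bit is set in `mask` (bit i = column i).
fn select_columns(row: &[Vec<u8>], mask: usize) -> Vec<Vec<u8>> {
    row.iter()
        .enumerate()
        .filter(|(i, _)| mask & (1 << i) != 0)
        .map(|(_, col)| col.clone())
        .collect()
}

fn main() {
    let row = vec![b"block".to_vec(), b"withdrawal".to_vec()];
    assert_eq!(select_columns(&row, 0b01), vec![b"block".to_vec()]);
    assert_eq!(select_columns(&row, 0b10), vec![b"withdrawal".to_vec()]);
    assert!(select_columns(&row, 0b00).is_empty());
}
```
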
@@ -1,99 +0,0 @@
-use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
-use ph::fmph::{BuildConf, Function};
-use serde::{
-    de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
-    Serializer,
-};
-
-/// Wrapper struct for [`Function`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
-#[derive(Default)]
-pub struct Fmph {
-    function: Option<Function>,
-}
-
-impl Fmph {
-    pub const fn new() -> Self {
-        Self { function: None }
-    }
-}
-
-impl PerfectHashingFunction for Fmph {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.function = Some(Function::from_slice_with_conf(
-            keys,
-            BuildConf { use_multiple_threads: true, ..Default::default() },
-        ));
-        Ok(())
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        if let Some(f) = &self.function {
-            return Ok(f.get(key))
-        }
-        Err(NippyJarError::PHFMissingKeys)
-    }
-}
-
-#[cfg(test)]
-impl PartialEq for Fmph {
-    fn eq(&self, _other: &Self) -> bool {
-        match (&self.function, &_other.function) {
-            (Some(func1), Some(func2)) => {
-                func1.level_sizes() == func2.level_sizes() &&
-                    func1.write_bytes() == func2.write_bytes() &&
-                    {
-                        let mut f1 = Vec::with_capacity(func1.write_bytes());
-                        func1.write(&mut f1).expect("enough capacity");
-
-                        let mut f2 = Vec::with_capacity(func2.write_bytes());
-                        func2.write(&mut f2).expect("enough capacity");
-
-                        f1 == f2
-                    }
-            }
-            (None, None) => true,
-            _ => false,
-        }
-    }
-}
-
-impl std::fmt::Debug for Fmph {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Fmph")
-            .field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
-            .finish_non_exhaustive()
-    }
-}
-
-impl Serialize for Fmph {
-    /// Potentially expensive, but should be used only when creating the file.
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        match &self.function {
-            Some(f) => {
-                let mut v = Vec::with_capacity(f.write_bytes());
-                f.write(&mut v).map_err(S::Error::custom)?;
-                serializer.serialize_some(&v)
-            }
-            None => serializer.serialize_none(),
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Fmph {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
-            return Ok(Self {
-                function: Some(
-                    Function::read(&mut std::io::Cursor::new(buffer)).map_err(D::Error::custom)?,
-                ),
-            })
-        }
-        Ok(Self { function: None })
-    }
-}
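
The deleted wrapper delegated to the `ph` crate's fingerprint-based minimal perfect hashing. A standalone sketch of the underlying calls the file used (`Function::from_slice_with_conf` and `Function::get`, as in the removed code); the key set and sizes are illustrative, and the exact generic bounds are per the `ph` 0.8 docs:

```rust
use ph::fmph::{BuildConf, Function};
use std::collections::HashSet;

fn main() {
    // Build over a fixed key set, with the same configuration the removed
    // wrapper passed (multi-threaded build).
    let keys: Vec<Vec<u8>> = (0u8..100).map(|i| vec![i; 32]).collect();
    let f = Function::from_slice_with_conf(
        &keys,
        BuildConf { use_multiple_threads: true, ..Default::default() },
    );

    // Every build key maps to Some(distinct integer) in 0..keys.len().
    let mut seen = HashSet::new();
    for key in &keys {
        let idx = f.get(key).expect("build keys always resolve");
        assert!(idx < keys.len() as u64 && seen.insert(idx));
    }

    // A key outside the build set may still return Some(_): the false
    // positives the removed cursor documentation warned about.
    let _maybe_alias = f.get(&vec![255u8; 32]);
}
```
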
@@ -1,100 +0,0 @@
-use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
-use ph::fmph::{GOBuildConf, GOFunction};
-use serde::{
-    de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
-    Serializer,
-};
-
-/// Wrapper struct for [`GOFunction`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
-#[derive(Default)]
-pub struct GoFmph {
-    function: Option<GOFunction>,
-}
-
-impl GoFmph {
-    pub const fn new() -> Self {
-        Self { function: None }
-    }
-}
-
-impl PerfectHashingFunction for GoFmph {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.function = Some(GOFunction::from_slice_with_conf(
-            keys,
-            GOBuildConf { use_multiple_threads: true, ..Default::default() },
-        ));
-        Ok(())
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        if let Some(f) = &self.function {
-            return Ok(f.get(key))
-        }
-        Err(NippyJarError::PHFMissingKeys)
-    }
-}
-
-#[cfg(test)]
-impl PartialEq for GoFmph {
-    fn eq(&self, other: &Self) -> bool {
-        match (&self.function, &other.function) {
-            (Some(func1), Some(func2)) => {
-                func1.level_sizes() == func2.level_sizes() &&
-                    func1.write_bytes() == func2.write_bytes() &&
-                    {
-                        let mut f1 = Vec::with_capacity(func1.write_bytes());
-                        func1.write(&mut f1).expect("enough capacity");
-
-                        let mut f2 = Vec::with_capacity(func2.write_bytes());
-                        func2.write(&mut f2).expect("enough capacity");
-
-                        f1 == f2
-                    }
-            }
-            (None, None) => true,
-            _ => false,
-        }
-    }
-}
-
-impl std::fmt::Debug for GoFmph {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("GoFmph")
-            .field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
-            .finish_non_exhaustive()
-    }
-}
-
-impl Serialize for GoFmph {
-    /// Potentially expensive, but should be used only when creating the file.
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        match &self.function {
-            Some(f) => {
-                let mut v = Vec::with_capacity(f.write_bytes());
-                f.write(&mut v).map_err(S::Error::custom)?;
-                serializer.serialize_some(&v)
-            }
-            None => serializer.serialize_none(),
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for GoFmph {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
-            return Ok(Self {
-                function: Some(
-                    GOFunction::read(&mut std::io::Cursor::new(buffer))
-                        .map_err(D::Error::custom)?,
-                ),
-            })
-        }
-        Ok(Self { function: None })
-    }
-}
@@ -1,46 +0,0 @@
-use crate::NippyJarError;
-use serde::{Deserialize, Serialize};
-use std::hash::Hash;
-
-mod fmph;
-pub use fmph::Fmph;
-
-mod go_fmph;
-pub use go_fmph::GoFmph;
-
-/// Trait alias for [`PerfectHashingFunction`] keys.
-pub trait PHFKey: AsRef<[u8]> + Sync + Clone + Hash {}
-impl<T: AsRef<[u8]> + Sync + Clone + Hash> PHFKey for T {}
-
-/// Trait to build and query a perfect hashing function.
-pub trait PerfectHashingFunction: Serialize + for<'a> Deserialize<'a> {
-    /// Adds the key set and builds the perfect hashing function.
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError>;
-
-    /// Get corresponding associated integer. There might be false positives.
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError>;
-}
-
-/// Enumerates all types of perfect hashing functions.
-#[derive(Debug, Serialize, Deserialize)]
-#[cfg_attr(test, derive(PartialEq))]
-pub enum Functions {
-    Fmph(Fmph),
-    GoFmph(GoFmph),
-}
-
-impl PerfectHashingFunction for Functions {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        match self {
-            Self::Fmph(f) => f.set_keys(keys),
-            Self::GoFmph(f) => f.set_keys(keys),
-        }
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        match self {
-            Self::Fmph(f) => f.get_index(key),
-            Self::GoFmph(f) => f.get_index(key),
-        }
-    }
-}