chore: remove phf from static files (#10259)

Co-authored-by: joshieDo <93316087+joshieDo@users.noreply.github.com>
Co-authored-by: Matthias Seitz <matthias.seitz@outlook.de>
Author: nk_ysg
Date: 2024-08-30 15:02:14 +08:00
Committed by: GitHub
Parent: c5a1c0e131
Commit: 28e46bfd48
9 changed files with 40 additions and 580 deletions

View File

@ -19,7 +19,6 @@ name = "reth_nippy_jar"
reth-fs-util.workspace = true
# filter
ph = "0.8.0"
cuckoofilter = { version = "0.5.0", features = [
"serde_support",
"serde_bytes",

View File

@ -1,10 +1,8 @@
use crate::{
compression::{Compression, Compressors, Zstd},
DataReader, InclusionFilter, NippyJar, NippyJarError, NippyJarHeader, PerfectHashingFunction,
RefRow,
DataReader, NippyJar, NippyJarError, NippyJarHeader, RefRow,
};
use std::{ops::Range, sync::Arc};
use sucds::int_vectors::Access;
use zstd::bulk::Decompressor;
/// Simple cursor implementation to retrieve data from [`NippyJar`].
@ -67,35 +65,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
self.row = 0;
}
/// Returns a row, searching it by a key.
///
/// **May return false positives.**
///
/// Example usage would be querying a transactions file with a transaction hash that is **NOT**
/// stored in the file.
pub fn row_by_key(&mut self, key: &[u8]) -> Result<Option<RefRow<'_>>, NippyJarError> {
if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
// TODO: is it worth parallelizing both?
// May have false positives
if filter.contains(key)? {
// May have false positives
if let Some(row_index) = phf.get_index(key)? {
self.row = self
.jar
.offsets_index
.access(row_index as usize)
.expect("built from same set") as u64;
return self.next_row()
}
}
} else {
return Err(NippyJarError::UnsupportedFilterQuery)
}
Ok(None)
}
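A note on the removed `row_by_key`: both the inclusion filter and the PHF admit false positives (never false negatives), so callers had to verify the returned row themselves. A minimal sketch of that caller-side check, assuming, as the tests later in this PR do, that the queried key happens to be stored as the row's first column; `verified_row_by_key` is a hypothetical helper, not part of the crate:

fn verified_row_by_key<H: NippyJarHeader>(
    cursor: &mut NippyJarCursor<'_, H>,
    key: &[u8],
) -> Result<Option<Vec<Vec<u8>>>, NippyJarError> {
    Ok(cursor.row_by_key(key)?.and_then(|row| {
        // Copy the borrowed columns out so the cursor borrow can end.
        let row: Vec<Vec<u8>> = row.iter().map(|col| col.to_vec()).collect();
        // A false positive maps an absent key to some arbitrary row; comparing
        // the stored key column against the query rejects it.
        (row[0] == key).then_some(row)
    }))
}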
/// Returns a row by its number.
pub fn row_by_number(&mut self, row: usize) -> Result<Option<RefRow<'_>>, NippyJarError> {
self.row = row as u64;
@ -130,40 +99,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
))
}
/// Returns a row, searching it by a key using a
/// `mask` to only read certain columns from the row.
///
/// **May return false positives.**
///
/// Example usage would be querying a transactions file with a transaction hash that is **NOT**
/// stored in the file.
pub fn row_by_key_with_cols(
&mut self,
key: &[u8],
mask: usize,
) -> Result<Option<RefRow<'_>>, NippyJarError> {
if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
// TODO: is it worth parallelizing both?
// May have false positives
if filter.contains(key)? {
// May have false positives
if let Some(row_index) = phf.get_index(key)? {
self.row = self
.jar
.offsets_index
.access(row_index as usize)
.expect("built from same set") as u64;
return self.next_row_with_cols(mask)
}
}
} else {
return Err(NippyJarError::UnsupportedFilterQuery)
}
Ok(None)
}
/// Returns a row by its number by using a `mask` to only read certain columns from the row.
pub fn row_by_number_with_cols(
&mut self,

View File

@ -31,10 +31,6 @@ pub enum NippyJarError {
FilterMaxCapacity,
#[error("cuckoo was not properly initialized after loaded")]
FilterCuckooNotLoaded,
#[error("perfect hashing function doesn't have any keys added")]
PHFMissingKeys,
#[error("nippy jar initialized without perfect hashing function")]
PHFMissing,
#[error("nippy jar was built without an index")]
UnsupportedFilterQuery,
#[error("the size of an offset must be at most 8 bytes, got {offset_size}")]

View File

@ -32,9 +32,10 @@ pub mod compression;
use compression::Compression;
use compression::Compressors;
pub mod phf;
pub use phf::PHFKey;
use phf::{Fmph, Functions, GoFmph, PerfectHashingFunction};
/// Empty enum for backwards compatibility.
#[derive(Debug, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq, Eq))]
pub enum Functions {}
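Since `phf` was already `#[serde(skip)]` (see the struct field below), an uninhabited enum keeps old configs readable and old field accesses compiling while making a PHF value impossible to construct. A standalone sketch of the trick, with illustrative names and `bincode` for the encoding, as in the config test below:

use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
enum Functions {} // uninhabited: no value of this type can exist

#[derive(Debug, Serialize, Deserialize)]
struct Config {
    columns: u64,
    #[serde(skip)]
    phf: Option<Functions>, // can only ever be `None`
}

fn main() {
    let cfg = Config { columns: 23, phf: None };
    // The skipped field never reaches the encoding, so files written before
    // and after the PHF removal carry identical config bytes.
    let bytes = bincode::serialize(&cfg).unwrap();
    let back: Config = bincode::deserialize(&bytes).unwrap();
    assert!(back.phf.is_none());
}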
mod error;
pub use error::NippyJarError;
@ -74,24 +75,6 @@ impl<T> NippyJarHeader for T where
///
/// Data is organized into a columnar format, enabling column-based compression. Data retrieval
/// entails consulting an offset list and fetching the data from file via `mmap`.
///
/// PHF & Filters:
/// For data membership verification, the `filter` field can be configured with algorithms like
/// Bloom or Cuckoo filters. While these filters enable rapid membership checks, it's important to
/// note that **they may yield false positives but not false negatives**. Therefore, they serve as
/// preliminary checks (eg. in `by_hash` queries) and should be followed by data verification on
/// retrieval.
///
/// The `phf` (Perfect Hashing Function) and `offsets_index` fields facilitate the data retrieval
/// process in, for example, `by_hash` queries. Specifically, the PHF converts a query, such as a
/// block hash, into a unique integer. This integer is then used as an index in `offsets_index`,
/// which maps to the actual data location in the `offsets` list. Similar to the `filter`, the PHF
/// may also produce false positives but not false negatives, necessitating subsequent data
/// verification.
///
/// Note that the key (eg. `BlockHash`) passed to a filter and phf does not need to actually be
/// stored.
///
/// Ultimately, the `freeze` function yields two files: a data file containing both the data and its
/// configuration, and an index file that houses the offsets and `offsets_index`.
#[derive(Serialize, Deserialize)]
@ -112,7 +95,7 @@ pub struct NippyJar<H = ()> {
/// Optional filter function for data membership checks.
filter: Option<InclusionFilters>,
#[serde(skip)]
/// Optional Perfect Hashing Function (PHF) for unique offset mapping.
/// Optional field for backwards compatibility.
phf: Option<Functions>,
/// Index mapping PHF output to value offsets in `offsets`.
#[serde(skip)]
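Condensing the removed doc block above into a toy sketch of the old `by_hash` query path; the name `find_row`, the closures, and the slices are hypothetical stand-ins, not the real types:

fn find_row(
    key: &[u8],
    filter_contains: impl Fn(&[u8]) -> bool,  // Bloom/Cuckoo membership check
    phf_index: impl Fn(&[u8]) -> Option<u64>, // perfect hash over the stored key set
    offsets_index: &[u64],                    // PHF output -> row number
) -> Option<u64> {
    if !filter_contains(key) {
        return None; // a definite miss: filters have no false negatives
    }
    let i = phf_index(key)? as usize;
    // Either step may have been a false positive for an absent key, so the
    // caller still verifies the retrieved row against `key`.
    offsets_index.get(i).copied()
}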
@ -196,18 +179,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
self
}
/// Adds [`phf::Fmph`] perfect hashing function.
pub fn with_fmph(mut self) -> Self {
self.phf = Some(Functions::Fmph(Fmph::new()));
self
}
/// Adds [`phf::GoFmph`] perfect hashing function.
pub fn with_gofmph(mut self) -> Self {
self.phf = Some(Functions::GoFmph(GoFmph::new()));
self
}
/// Gets a reference to the user header.
pub const fn user_header(&self) -> &H {
&self.user_header
@ -346,16 +317,6 @@ impl<H: NippyJarHeader> InclusionFilter for NippyJar<H> {
}
}
impl<H: NippyJarHeader> PerfectHashingFunction for NippyJar<H> {
fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
self.phf.as_mut().ok_or(NippyJarError::PHFMissing)?.set_keys(keys)
}
fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
self.phf.as_ref().ok_or(NippyJarError::PHFMissing)?.get_index(key)
}
}
#[cfg(test)]
impl<H: NippyJarHeader> NippyJar<H> {
/// If required, prepares any compression algorithm to an early pass of the data.
@ -371,55 +332,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
Ok(())
}
/// Prepares the offsets index beforehand for querying rows based on `values` (eg. transaction
/// hash). Expects `values` to be sorted in the same way as the data that will be inserted
/// later on.
///
/// Currently collecting all items before acting on them.
pub fn prepare_index<T: PHFKey>(
&mut self,
values: impl IntoIterator<Item = ColumnResult<T>>,
row_count: usize,
) -> Result<(), NippyJarError> {
debug!(target: "nippy-jar", ?row_count, "Preparing index.");
let values = values.into_iter().collect::<Result<Vec<_>, _>>()?;
debug_assert!(
row_count == values.len(),
"Row count ({row_count}) differs from value list count ({}).",
values.len()
);
let mut offsets_index = vec![0; row_count];
// Builds perfect hashing function from the values
if let Some(phf) = self.phf.as_mut() {
debug!(target: "nippy-jar", ?row_count, values_count = ?values.len(), "Setting keys for perfect hashing function.");
phf.set_keys(&values)?;
}
if self.filter.is_some() || self.phf.is_some() {
debug!(target: "nippy-jar", ?row_count, "Creating filter and offsets_index.");
for (row_num, v) in values.into_iter().enumerate() {
if let Some(filter) = self.filter.as_mut() {
filter.add(v.as_ref())?;
}
if let Some(phf) = self.phf.as_mut() {
// Points to the first column value offset of the row.
let index = phf.get_index(v.as_ref())?.expect("initialized") as usize;
let _ = std::mem::replace(&mut offsets_index[index], row_num as u64);
}
}
}
debug!(target: "nippy-jar", ?row_count, "Encoding offsets index list.");
self.offsets_index = PrefixSummedEliasFano::from_slice(&offsets_index)?;
Ok(())
}
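The inversion in the removed `prepare_index` is easy to miss: the PHF assigns each key an arbitrary dense slot, and the loop writes the key's row number into that slot, so that `offsets_index[phf(key)]` later recovers the row. A worked toy example, with the PHF outputs assumed for illustration:

// Suppose the PHF maps k0 -> 2, k1 -> 0, k2 -> 1.
let phf = |k: &str| -> usize {
    match k { "k0" => 2, "k1" => 0, "k2" => 1, _ => unreachable!() }
};
let keys = ["k0", "k1", "k2"];
let mut offsets_index = vec![0u64; keys.len()];
for (row_num, k) in keys.iter().enumerate() {
    offsets_index[phf(k)] = row_num as u64; // row of `k`, stored at its PHF slot
}
assert_eq!(offsets_index, vec![1, 2, 0]);
// Lookup inverts it: offsets_index[phf("k2")] == 2, i.e. k2 lives in row 2.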
/// Writes all data and configuration to a file and the offset index to another.
pub fn freeze(
self,
@ -447,7 +359,7 @@ impl<H: NippyJarHeader> NippyJar<H> {
Ok(writer.into_jar())
}
/// Freezes [`PerfectHashingFunction`], [`InclusionFilter`] and the offset index to file.
/// Freezes [`InclusionFilter`] and the offset index to file.
fn freeze_filters(&self) -> Result<(), NippyJarError> {
debug!(target: "nippy-jar", path=?self.index_path(), "Writing offsets and offsets index to file.");
@ -474,11 +386,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
}
}
// Check `prepare_index` was called.
if let Some(phf) = &self.phf {
let _ = phf.get_index(&[])?;
}
Ok(())
}
}
@ -588,7 +495,7 @@ mod tests {
use super::*;
use compression::Compression;
use rand::{rngs::SmallRng, seq::SliceRandom, RngCore, SeedableRng};
use std::{collections::HashSet, fs::OpenOptions};
use std::{fs::OpenOptions, io::Read};
type ColumnResults<T> = Vec<ColumnResult<T>>;
type ColumnValues = Vec<Vec<u8>>;
@ -617,57 +524,30 @@ mod tests {
}
#[test]
fn test_phf() {
let (col1, col2) = test_data(None);
let num_columns = 2;
let num_rows = col1.len() as u64;
let file_path = tempfile::NamedTempFile::new().unwrap();
fn test_config_serialization() {
let file = tempfile::NamedTempFile::new().unwrap();
let jar = NippyJar::new_without_header(23, file.path()).with_lz4();
jar.freeze_config().unwrap();
let create_nippy = || -> NippyJar<()> {
let mut nippy = NippyJar::new_without_header(num_columns, file_path.path());
assert!(matches!(
NippyJar::set_keys(&mut nippy, &col1),
Err(NippyJarError::PHFMissing)
));
nippy
};
let mut config_file = OpenOptions::new().read(true).open(jar.config_path()).unwrap();
let config_file_len = config_file.metadata().unwrap().len();
assert_eq!(config_file_len, 37);
let check_phf = |mut nippy: NippyJar<_>| {
assert!(matches!(
NippyJar::get_index(&nippy, &col1[0]),
Err(NippyJarError::PHFMissingKeys)
));
assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
let mut buf = Vec::with_capacity(config_file_len as usize);
config_file.read_to_end(&mut buf).unwrap();
let collect_indexes = |nippy: &NippyJar<_>| -> Vec<u64> {
col1.iter()
.map(|value| NippyJar::get_index(nippy, value.as_slice()).unwrap().unwrap())
.collect()
};
assert_eq!(
vec![
1, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0
],
buf
);
// Ensure all indexes are unique
let indexes = collect_indexes(&nippy);
assert_eq!(indexes.iter().collect::<HashSet<_>>().len(), indexes.len());
// Ensure reproducibility
assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
assert_eq!(indexes, collect_indexes(&nippy));
// Ensure that loaded phf provides the same function outputs
nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
nippy
.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
.unwrap();
let mut loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
loaded_nippy.load_filters().unwrap();
assert_eq!(indexes, collect_indexes(&loaded_nippy));
};
// fmph bytes size for 100 values of 32 bytes: 54
check_phf(create_nippy().with_fmph());
// fmph bytes size for 100 values of 32 bytes: 46
check_phf(create_nippy().with_gofmph());
let mut read_jar = bincode::deserialize_from::<_, NippyJar>(&buf[..]).unwrap();
// Path is not ser/de
read_jar.path = file.path().to_path_buf();
assert_eq!(jar, read_jar);
}
#[test]
@ -891,11 +771,9 @@ mod tests {
let mut nippy =
NippyJar::new(num_columns, file_path.path(), BlockJarHeader { block_start })
.with_zstd(true, 5000)
.with_cuckoo_filter(col1.len())
.with_fmph();
.with_cuckoo_filter(col1.len());
nippy.prepare_compression(data.clone()).unwrap();
nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
nippy
.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
.unwrap();
@ -908,7 +786,6 @@ mod tests {
assert!(loaded_nippy.compressor().is_some());
assert!(loaded_nippy.filter.is_some());
assert!(loaded_nippy.phf.is_some());
assert_eq!(loaded_nippy.user_header().block_start, block_start);
if let Some(Compressors::Zstd(_zstd)) = loaded_nippy.compressor() {
@ -929,22 +806,9 @@ mod tests {
data.shuffle(&mut rand::thread_rng());
for (row_num, (v0, v1)) in data {
// Simulates `by_hash` queries by iterating col1 values, which were used to
// create the inner index.
{
let row_by_value = cursor
.row_by_key(v0)
.unwrap()
.unwrap()
.iter()
.map(|a| a.to_vec())
.collect::<Vec<_>>();
assert_eq!((&row_by_value[0], &row_by_value[1]), (v0, v1));
// Simulates `by_number` queries
let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
assert_eq!(row_by_value, row_by_num);
}
// Simulates `by_number` queries
let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (v0, v1));
}
}
}
@ -962,11 +826,9 @@ mod tests {
{
let mut nippy = NippyJar::new_without_header(num_columns, file_path.path())
.with_zstd(true, 5000)
.with_cuckoo_filter(col1.len())
.with_fmph();
.with_cuckoo_filter(col1.len());
nippy.prepare_compression(data).unwrap();
nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
nippy
.freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
.unwrap();
@ -989,84 +851,41 @@ mod tests {
// Read both columns
for (row_num, (v0, v1)) in &data {
// Simulates `by_hash` queries by iterating col1 values, which were used to
// create the inner index.
let row_by_value = cursor
.row_by_key_with_cols(v0, BLOCKS_FULL_MASK)
.unwrap()
.unwrap()
.iter()
.map(|a| a.to_vec())
.collect::<Vec<_>>();
assert_eq!((&row_by_value[0], &row_by_value[1]), (*v0, *v1));
// Simulates `by_number` queries
let row_by_num = cursor
.row_by_number_with_cols(*row_num, BLOCKS_FULL_MASK)
.unwrap()
.unwrap();
assert_eq!(row_by_value, row_by_num);
assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (*v0, *v1));
}
// Read first column only: `Block`
const BLOCKS_BLOCK_MASK: usize = 0b01;
for (row_num, (v0, _)) in &data {
// Simulates `by_hash` queries by iterating col1 values, which were used to
// create the inner index.
let row_by_value = cursor
.row_by_key_with_cols(v0, BLOCKS_BLOCK_MASK)
.unwrap()
.unwrap()
.iter()
.map(|a| a.to_vec())
.collect::<Vec<_>>();
assert_eq!(row_by_value.len(), 1);
assert_eq!(&row_by_value[0], *v0);
// Simulates `by_number` queries
let row_by_num = cursor
.row_by_number_with_cols(*row_num, BLOCKS_BLOCK_MASK)
.unwrap()
.unwrap();
assert_eq!(row_by_num.len(), 1);
assert_eq!(row_by_value, row_by_num);
assert_eq!(&row_by_num[0].to_vec(), *v0);
}
// Read second column only: `Withdrawal`
const BLOCKS_WITHDRAWAL_MASK: usize = 0b10;
for (row_num, (v0, v1)) in &data {
// Simulates `by_hash` queries by iterating col1 values, which were used to
// create the inner index.
let row_by_value = cursor
.row_by_key_with_cols(v0, BLOCKS_WITHDRAWAL_MASK)
.unwrap()
.unwrap()
.iter()
.map(|a| a.to_vec())
.collect::<Vec<_>>();
assert_eq!(row_by_value.len(), 1);
assert_eq!(&row_by_value[0], *v1);
for (row_num, (_, v1)) in &data {
// Simulates `by_number` queries
let row_by_num = cursor
.row_by_number_with_cols(*row_num, BLOCKS_WITHDRAWAL_MASK)
.unwrap()
.unwrap();
assert_eq!(row_by_num.len(), 1);
assert_eq!(row_by_value, row_by_num);
assert_eq!(&row_by_num[0].to_vec(), *v1);
}
// Read nothing
const BLOCKS_EMPTY_MASK: usize = 0b00;
for (row_num, (v0, _)) in &data {
// Simulates `by_hash` queries by iterating col1 values, which were used to
// create the inner index.
assert!(cursor
.row_by_key_with_cols(v0, BLOCKS_EMPTY_MASK)
.unwrap()
.unwrap()
.is_empty());
for (row_num, _) in &data {
// Simulates `by_number` queries
assert!(cursor
.row_by_number_with_cols(*row_num, BLOCKS_EMPTY_MASK)

View File

@ -1,99 +0,0 @@
use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
use ph::fmph::{BuildConf, Function};
use serde::{
de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
Serializer,
};
/// Wrapper struct for [`Function`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
#[derive(Default)]
pub struct Fmph {
function: Option<Function>,
}
impl Fmph {
pub const fn new() -> Self {
Self { function: None }
}
}
impl PerfectHashingFunction for Fmph {
fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
self.function = Some(Function::from_slice_with_conf(
keys,
BuildConf { use_multiple_threads: true, ..Default::default() },
));
Ok(())
}
fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
if let Some(f) = &self.function {
return Ok(f.get(key))
}
Err(NippyJarError::PHFMissingKeys)
}
}
#[cfg(test)]
impl PartialEq for Fmph {
fn eq(&self, _other: &Self) -> bool {
match (&self.function, &_other.function) {
(Some(func1), Some(func2)) => {
func1.level_sizes() == func2.level_sizes() &&
func1.write_bytes() == func2.write_bytes() &&
{
let mut f1 = Vec::with_capacity(func1.write_bytes());
func1.write(&mut f1).expect("enough capacity");
let mut f2 = Vec::with_capacity(func2.write_bytes());
func2.write(&mut f2).expect("enough capacity");
f1 == f2
}
}
(None, None) => true,
_ => false,
}
}
}
impl std::fmt::Debug for Fmph {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Fmph")
.field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
.finish_non_exhaustive()
}
}
impl Serialize for Fmph {
/// Potentially expensive, but should be used only when creating the file.
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match &self.function {
Some(f) => {
let mut v = Vec::with_capacity(f.write_bytes());
f.write(&mut v).map_err(S::Error::custom)?;
serializer.serialize_some(&v)
}
None => serializer.serialize_none(),
}
}
}
impl<'de> Deserialize<'de> for Fmph {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
return Ok(Self {
function: Some(
Function::read(&mut std::io::Cursor::new(buffer)).map_err(D::Error::custom)?,
),
})
}
Ok(Self { function: None })
}
}
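Since [`Function`] does not implement serde itself, the impls above bridge it through its own `write`/`read`. A round-trip sketch one might have written against this file's API (a hypothetical test, with `bincode` assumed as the serde format):

let keys: Vec<[u8; 32]> = (0u8..100).map(|i| [i; 32]).collect();
let mut phf = Fmph::new();
phf.set_keys(&keys).unwrap();
let bytes = bincode::serialize(&phf).unwrap();              // Function::write under the hood
let restored: Fmph = bincode::deserialize(&bytes).unwrap(); // Function::read
for k in &keys {
    // The restored function must reproduce every index of the original.
    assert_eq!(phf.get_index(k).unwrap(), restored.get_index(k).unwrap());
}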

View File

@ -1,100 +0,0 @@
use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
use ph::fmph::{GOBuildConf, GOFunction};
use serde::{
de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
Serializer,
};
/// Wrapper struct for [`GOFunction`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
#[derive(Default)]
pub struct GoFmph {
function: Option<GOFunction>,
}
impl GoFmph {
pub const fn new() -> Self {
Self { function: None }
}
}
impl PerfectHashingFunction for GoFmph {
fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
self.function = Some(GOFunction::from_slice_with_conf(
keys,
GOBuildConf { use_multiple_threads: true, ..Default::default() },
));
Ok(())
}
fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
if let Some(f) = &self.function {
return Ok(f.get(key))
}
Err(NippyJarError::PHFMissingKeys)
}
}
#[cfg(test)]
impl PartialEq for GoFmph {
fn eq(&self, other: &Self) -> bool {
match (&self.function, &other.function) {
(Some(func1), Some(func2)) => {
func1.level_sizes() == func2.level_sizes() &&
func1.write_bytes() == func2.write_bytes() &&
{
let mut f1 = Vec::with_capacity(func1.write_bytes());
func1.write(&mut f1).expect("enough capacity");
let mut f2 = Vec::with_capacity(func2.write_bytes());
func2.write(&mut f2).expect("enough capacity");
f1 == f2
}
}
(None, None) => true,
_ => false,
}
}
}
impl std::fmt::Debug for GoFmph {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("GoFmph")
.field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
.finish_non_exhaustive()
}
}
impl Serialize for GoFmph {
/// Potentially expensive, but should be used only when creating the file.
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match &self.function {
Some(f) => {
let mut v = Vec::with_capacity(f.write_bytes());
f.write(&mut v).map_err(S::Error::custom)?;
serializer.serialize_some(&v)
}
None => serializer.serialize_none(),
}
}
}
impl<'de> Deserialize<'de> for GoFmph {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
return Ok(Self {
function: Some(
GOFunction::read(&mut std::io::Cursor::new(buffer))
.map_err(D::Error::custom)?,
),
})
}
Ok(Self { function: None })
}
}

View File

@ -1,46 +0,0 @@
use crate::NippyJarError;
use serde::{Deserialize, Serialize};
use std::hash::Hash;
mod fmph;
pub use fmph::Fmph;
mod go_fmph;
pub use go_fmph::GoFmph;
/// Trait alias for [`PerfectHashingFunction`] keys.
pub trait PHFKey: AsRef<[u8]> + Sync + Clone + Hash {}
impl<T: AsRef<[u8]> + Sync + Clone + Hash> PHFKey for T {}
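The pair above is the blanket-impl trait-alias idiom: any type meeting the bounds is a `PHFKey` automatically. For example (`count_keys` is illustrative):

fn count_keys<T: PHFKey>(keys: &[T]) -> usize {
    keys.len()
}
// 32-byte arrays (eg. hashes) qualify without any explicit impl.
assert_eq!(count_keys(&[[0u8; 32], [1u8; 32]]), 2);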
/// Trait to build and query a perfect hashing function.
pub trait PerfectHashingFunction: Serialize + for<'a> Deserialize<'a> {
/// Adds the key set and builds the perfect hashing function.
fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError>;
/// Gets the corresponding associated integer. There might be false positives.
fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError>;
}
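A toy implementor makes the contract concrete. `MapPhf` is purely illustrative: it swaps the PHF's compact encoding for a plain map, which costs O(n) space but cannot return false positives:

use std::collections::HashMap;

#[derive(Default, serde::Serialize, serde::Deserialize)]
struct MapPhf(HashMap<Vec<u8>, u64>);

impl PerfectHashingFunction for MapPhf {
    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
        // Assign dense indices in insertion order.
        self.0 = keys
            .iter()
            .enumerate()
            .map(|(i, k)| (k.as_ref().to_vec(), i as u64))
            .collect();
        Ok(())
    }

    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
        Ok(self.0.get(key).copied())
    }
}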
/// Enumerates all types of perfect hashing functions.
#[derive(Debug, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]
pub enum Functions {
Fmph(Fmph),
GoFmph(GoFmph),
}
impl PerfectHashingFunction for Functions {
fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
match self {
Self::Fmph(f) => f.set_keys(keys),
Self::GoFmph(f) => f.set_keys(keys),
}
}
fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
match self {
Self::Fmph(f) => f.get_index(key),
Self::GoFmph(f) => f.get_index(key),
}
}
}