From 5c55c57c5e575320d996d3ab0394c4cdf8215b5c Mon Sep 17 00:00:00 2001 From: rakita Date: Tue, 25 Oct 2022 11:50:02 +0200 Subject: [PATCH] feat(db): switch to reth-libmdbx-rs, bump reth-mdbx-sys to 0.12.1 (#133) * feat(db): Add mdbx-rs apache licenced code 55e234 * feat(db): replace mdbx with reth-mdbx, metadata changes * chore(db): bump mdbx-sys to 0.12.1 --- Cargo.lock | 61 +- Cargo.toml | 4 +- crates/db/Cargo.toml | 2 +- crates/db/src/kv/cursor.rs | 4 +- crates/db/src/kv/mod.rs | 6 +- crates/db/src/kv/tx.rs | 2 +- crates/db/src/lib.rs | 2 +- crates/libmdbx-rs/Cargo.lock | 1012 ++ crates/libmdbx-rs/Cargo.toml | 19 +- crates/libmdbx-rs/benches/cursor.rs | 2 +- crates/libmdbx-rs/benches/transaction.rs | 2 +- crates/libmdbx-rs/benches/utils.rs | 2 +- crates/libmdbx-rs/mdbx-sys/Cargo.toml | 18 +- .../mdbx-sys/libmdbx/CMakeLists.txt | 15 +- .../libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md | 128 +- .../libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile | 36 +- crates/libmdbx-rs/mdbx-sys/libmdbx/README.md | 30 +- .../libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt | 2 +- .../libmdbx-rs/mdbx-sys/libmdbx/config.h.in | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_chk.1 | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_copy.1 | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_drop.1 | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_dump.1 | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_load.1 | 2 +- .../mdbx-sys/libmdbx/man1/mdbx_stat.1 | 2 +- crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c | 14131 ++++++++-------- crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ | 1037 +- crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h | 301 +- crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ | 58 +- crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c | 739 +- .../libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c | 693 +- .../libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c | 693 +- .../libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c | 711 +- .../libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c | 701 +- .../libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c | 697 +- crates/libmdbx-rs/src/lib.rs | 17 +- crates/libmdbx-rs/tests/cursor.rs | 4 +- crates/libmdbx-rs/tests/environment.rs | 4 +- crates/libmdbx-rs/tests/transaction.rs | 4 +- 39 files changed, 11740 insertions(+), 9411 deletions(-) create mode 100644 crates/libmdbx-rs/Cargo.lock diff --git a/Cargo.lock b/Cargo.lock index 57464690e..1f13b6638 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,9 +196,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.60.1" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" +checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" dependencies = [ "bitflags", "cexpr", @@ -211,6 +211,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", + "syn", ] [[package]] @@ -1797,19 +1798,12 @@ dependencies = [ ] [[package]] -name = "libmdbx" -version = "0.1.8" +name = "lifetimed-bytes" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c535ae90a96067a09087672f6eac8a7f010ab7d577037cf2e99714c2dceb1" +checksum = "4c970c8ea4c7b023a41cfa4af4c785a16694604c2f2a3b0d1f20a9bcb73fa550" dependencies = [ - "bitflags", - "byteorder", - "derive_more", - "indexmap", - "libc", - "mdbx-sys", - "parking_lot", - "thiserror", + "bytes", ] [[package]] @@ -1843,17 +1837,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" -[[package]] -name = "mdbx-sys" -version = "0.12.1-0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd34990bd2d486fb7dda34ce57cbcd393d46afebb985a39c8ad84f6c70220aa" -dependencies = [ - "bindgen", - "cc", - "libc", -] - [[package]] name = "memchr" version = "2.5.0" @@ -2485,11 +2468,11 @@ dependencies = [ "criterion", "eyre", "iai", - "libmdbx", "page_size", "parity-scale-codec", "postcard", "reth-interfaces", + "reth-libmdbx", "reth-primitives", "serde", "tempfile", @@ -2604,6 +2587,34 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "reth-libmdbx" +version = "0.1.6" +dependencies = [ + "bitflags", + "byteorder", + "criterion", + "derive_more", + "indexmap", + "libc", + "lifetimed-bytes", + "parking_lot", + "rand", + "rand_xorshift", + "reth-mdbx-sys", + "tempfile", + "thiserror", +] + +[[package]] +name = "reth-mdbx-sys" +version = "0.12.1-0" +dependencies = [ + "bindgen", + "cc", + "libc", +] + [[package]] name = "reth-p2p" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 03d007127..263f7839a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,8 @@ members = [ "crates/primitives", "crates/stages", "crates/transaction-pool", - "crates/db" + "crates/db", + "crates/libmdbx-rs", + "crates/libmdbx-rs/mdbx-sys" ] default-members = ["bin/reth"] diff --git a/crates/db/Cargo.toml b/crates/db/Cargo.toml index 7ecb1605a..7d71a62bf 100644 --- a/crates/db/Cargo.toml +++ b/crates/db/Cargo.toml @@ -11,6 +11,7 @@ description = "Staged syncing primitives used in reth." # reth reth-primitives = { path = "../primitives" } reth-interfaces = { path = "../interfaces" } +reth-libmdbx = { path = "../libmdbx-rs" } # codecs serde = { version = "1.0.*", default-features = false } @@ -19,7 +20,6 @@ parity-scale-codec = { version = "3.2.1", features = ["bytes"] } # misc bytes = "1.2.1" -libmdbx = "0.1.8" page_size = "0.4.2" thiserror = "1.0.37" tempfile = { version = "3.3.0", optional = true } diff --git a/crates/db/src/kv/cursor.rs b/crates/db/src/kv/cursor.rs index 9e8a24af5..c8a037f62 100644 --- a/crates/db/src/kv/cursor.rs +++ b/crates/db/src/kv/cursor.rs @@ -3,7 +3,7 @@ use std::marker::PhantomData; use crate::utils::*; -use libmdbx::{self, TransactionKind, WriteFlags, RO, RW}; +use reth_libmdbx::{self, TransactionKind, WriteFlags, RO, RW}; use reth_interfaces::db::{ DbCursorRO, DbCursorRW, DbDupCursorRO, DbDupCursorRW, DupSort, DupWalker, Encode, Error, Table, Walker, @@ -25,7 +25,7 @@ pub type CursorRW<'tx, T> = Cursor<'tx, RW, T>; #[derive(Debug)] pub struct Cursor<'tx, K: TransactionKind, T: Table> { /// Inner `libmdbx` cursor. - pub inner: libmdbx::Cursor<'tx, K>, + pub inner: reth_libmdbx::Cursor<'tx, K>, /// Table name as is inside the database. pub table: &'static str, /// Phantom data to enforce encoding/decoding. diff --git a/crates/db/src/kv/mod.rs b/crates/db/src/kv/mod.rs index e8d55616c..0764d751c 100644 --- a/crates/db/src/kv/mod.rs +++ b/crates/db/src/kv/mod.rs @@ -1,7 +1,7 @@ //! Module that interacts with MDBX. 
use crate::utils::default_page_size; -use libmdbx::{ +use reth_libmdbx::{ DatabaseFlags, Environment, EnvironmentFlags, EnvironmentKind, Geometry, Mode, PageSize, SyncMode, RO, RW, }; @@ -99,7 +99,7 @@ impl Env { } impl Deref for Env { - type Target = libmdbx::Environment; + type Target = reth_libmdbx::Environment; fn deref(&self) -> &Self::Target { &self.inner @@ -134,7 +134,7 @@ pub mod test_utils { #[cfg(test)] mod tests { use super::{test_utils, Env, EnvKind}; - use libmdbx::{NoWriteMap, WriteMap}; + use reth_libmdbx::{NoWriteMap, WriteMap}; use reth_interfaces::db::{ tables::{Headers, PlainAccountState, PlainStorageState}, Database, DbCursorRO, DbDupCursorRO, DbTx, DbTxMut, diff --git a/crates/db/src/kv/tx.rs b/crates/db/src/kv/tx.rs index 1958f2c1f..9a401a1e0 100644 --- a/crates/db/src/kv/tx.rs +++ b/crates/db/src/kv/tx.rs @@ -1,7 +1,7 @@ //! Transaction wrapper for libmdbx-sys. use crate::{kv::cursor::Cursor, utils::decode_one}; -use libmdbx::{EnvironmentKind, Transaction, TransactionKind, WriteFlags, RW}; +use reth_libmdbx::{EnvironmentKind, Transaction, TransactionKind, WriteFlags, RW}; use reth_interfaces::db::{DbTx, DbTxGAT, DbTxMut, DbTxMutGAT, DupSort, Encode, Error, Table}; use std::marker::PhantomData; diff --git a/crates/db/src/lib.rs b/crates/db/src/lib.rs index 89cf267d5..524643097 100644 --- a/crates/db/src/lib.rs +++ b/crates/db/src/lib.rs @@ -9,7 +9,7 @@ /// Rust bindings for [MDBX](https://libmdbx.dqdkfa.ru/). pub mod mdbx { - pub use libmdbx::*; + pub use reth_libmdbx::*; } pub mod kv; diff --git a/crates/libmdbx-rs/Cargo.lock b/crates/libmdbx-rs/Cargo.lock new file mode 100644 index 000000000..18c3cf63d --- /dev/null +++ b/crates/libmdbx-rs/Cargo.lock @@ -0,0 +1,1012 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bindgen" +version = "0.60.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "criterion" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "indexmap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" + +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55edcf6c0bb319052dea84732cf99db461780fd5e8d3eb46ab6ff312ab31f197" + +[[package]] +name = "libloading" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "lifetimed-bytes" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c970c8ea4c7b023a41cfa4af4c785a16694604c2f2a3b0d1f20a9bcb73fa550" +dependencies = [ + "bytes", +] + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "mdbx-sys" +version = "0.11.8-0" +dependencies = [ + "bindgen", + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc9e0dc2adc1c69d09143aff38d3d30c5c3f0df0dad82e6d25547af174ebec0" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "plotters" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" + +[[package]] +name = "plotters-svg" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "proc-macro2" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rayon" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "reth-libmdbx" +version = "0.1.6" +dependencies = [ + "bitflags", + "byteorder", + "criterion", + "derive_more", + "indexmap", + "libc", + "lifetimed-bytes", + "mdbx-sys", + "parking_lot", + "rand", + "rand_xorshift", + "tempfile", + "thiserror", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version 
= "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "semver" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce777b7b150d76b9cf60d28b55f5847135a003f7d7350c6be7a773508ce7d45" +dependencies = [ + "itoa 1.0.4", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "syn" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" diff --git a/crates/libmdbx-rs/Cargo.toml b/crates/libmdbx-rs/Cargo.toml index c0bf08e06..fc355d23e 100644 --- a/crates/libmdbx-rs/Cargo.toml +++ b/crates/libmdbx-rs/Cargo.toml @@ -1,21 +1,14 @@ [package] -name = "libmdbx" +name = "reth-libmdbx" version = "0.1.6" edition = "2021" license = "Apache-2.0" -description = "Idiomatic and safe MDBX wrapper." 
-documentation = "https://docs.rs/libmdbx" -homepage = "https://github.com/vorot93/libmdbx-rs" -repository = "https://github.com/vorot93/libmdbx-rs" +description = "Idiomatic and safe MDBX wrapper with good licence" +repository = "https://github.com/foundry-rs/reth" readme = "README.md" -keywords = ["LMDB", "MDBX", "database", "storage-engine", "bindings"] -categories = ["database"] [lib] -name = "libmdbx" - -[workspace] -members = ["mdbx-sys"] +name = "reth_libmdbx" [dependencies] bitflags = "1" @@ -26,12 +19,12 @@ libc = "0.2" parking_lot = "0.12" thiserror = "1" -ffi = { package = "mdbx-sys", version = "=0.11.8-0", path = "./mdbx-sys" } +ffi = { package = "reth-mdbx-sys", path = "./mdbx-sys" } lifetimed-bytes = { version = "0.1", optional = true } [dev-dependencies] -criterion = "0.3" +criterion = "0.4" rand = "0.8" rand_xorshift = "0.3" tempfile = "3" diff --git a/crates/libmdbx-rs/benches/cursor.rs b/crates/libmdbx-rs/benches/cursor.rs index 41e830dd9..d316ea677 100644 --- a/crates/libmdbx-rs/benches/cursor.rs +++ b/crates/libmdbx-rs/benches/cursor.rs @@ -2,7 +2,7 @@ mod utils; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use ffi::*; -use libmdbx::*; +use reth_libmdbx::*; use std::ptr; use utils::*; diff --git a/crates/libmdbx-rs/benches/transaction.rs b/crates/libmdbx-rs/benches/transaction.rs index 3c9de48ce..fd8761fb6 100644 --- a/crates/libmdbx-rs/benches/transaction.rs +++ b/crates/libmdbx-rs/benches/transaction.rs @@ -3,7 +3,7 @@ mod utils; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use ffi::*; use libc::size_t; -use libmdbx::{ObjectLength, WriteFlags}; +use reth_libmdbx::{ObjectLength, WriteFlags}; use rand::{prelude::SliceRandom, SeedableRng}; use rand_xorshift::XorShiftRng; use std::ptr; diff --git a/crates/libmdbx-rs/benches/utils.rs b/crates/libmdbx-rs/benches/utils.rs index 673b3127a..18032e78c 100644 --- a/crates/libmdbx-rs/benches/utils.rs +++ b/crates/libmdbx-rs/benches/utils.rs @@ -1,4 +1,4 @@ -use libmdbx::{Environment, NoWriteMap, WriteFlags}; +use reth_libmdbx::{Environment, NoWriteMap, WriteFlags}; use tempfile::{tempdir, TempDir}; pub fn get_key(n: u32) -> String { diff --git a/crates/libmdbx-rs/mdbx-sys/Cargo.toml b/crates/libmdbx-rs/mdbx-sys/Cargo.toml index 6d901ed36..521499461 100644 --- a/crates/libmdbx-rs/mdbx-sys/Cargo.toml +++ b/crates/libmdbx-rs/mdbx-sys/Cargo.toml @@ -1,22 +1,18 @@ [package] -name = "mdbx-sys" -version = "0.11.8-0" +name = "reth-mdbx-sys" +version = "0.12.1-0" edition = "2021" license = "Apache-2.0" -description = "Rust bindings for libmdbx." -documentation = "https://docs.rs/mdbx-sys" -homepage = "https://github.com/vorot93/libmdbx-rs" -repository = "https://github.com/vorot93/libmdbx-rs" -readme = "../README.md" -keywords = ["MDBX", "database", "storage-engine", "bindings", "library"] -categories = ["database", "external-ffi-bindings"] +description = "Rust bindings for libmdbx with good licence." 
+repository = "https://github.com/foundry-rs/reth" +readme = "README.md" [lib] -name = "mdbx_sys" +name = "reth_mdbx_sys" [dependencies] libc = "0.2" [build-dependencies] cc = "1.0" -bindgen = { version = "0.60", default-features = false, features = ["runtime"] } +bindgen = { version = "0.61", default-features = false, features = ["runtime"] } diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt b/crates/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt index 387d60d7f..0b9155aa1 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt @@ -233,6 +233,7 @@ if(SUBPROJECT) if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) endif() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT OFF) else() option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" ON) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) @@ -341,9 +342,14 @@ else() endif() endif(NOT MDBX_AMALGAMATED_SOURCE) - setup_compile_flags() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT ON) endif(SUBPROJECT) +option(MDBX_MANAGE_BUILD_FLAGS "Allow libmdbx to configure/manage/override its own build flags" ${MDBX_MANAGE_BUILD_FLAGS_DEFAULT}) +if(MDBX_MANAGE_BUILD_FLAGS) + setup_compile_flags() +endif() + list(FIND CMAKE_C_COMPILE_FEATURES c_std_11 HAS_C11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_11 HAS_CXX11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_14 HAS_CXX14) @@ -497,7 +503,7 @@ mark_as_advanced(MDBX_LOCKING) add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake" AUTO) mark_as_advanced(MDBX_TRUST_RTC) option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking" OFF) -option(MDBX_DISABLE_PAGECHECKS "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) +option(MDBX_DISABLE_VALIDATION "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) if(NOT MDBX_AMALGAMATED_SOURCE) if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") @@ -569,7 +575,10 @@ else() "${MDBX_SOURCE_DIR}/options.h" "${MDBX_SOURCE_DIR}/base.h" "${MDBX_SOURCE_DIR}/internals.h" "${MDBX_SOURCE_DIR}/osal.h" "${MDBX_SOURCE_DIR}/core.c" "${MDBX_SOURCE_DIR}/osal.c" - "${MDBX_SOURCE_DIR}/lck-posix.c" "${MDBX_SOURCE_DIR}/lck-windows.c") + "${MDBX_SOURCE_DIR}/lck-posix.c") + if(NOT APPLE) + list(APPEND LIBMDBX_SOURCES "${MDBX_SOURCE_DIR}/lck-windows.c") + endif() include_directories("${MDBX_SOURCE_DIR}") endif() endif(MDBX_AMALGAMATED_SOURCE) diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md b/crates/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md index 945b2eaab..f69516ccc 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md @@ -1,7 +1,128 @@ ChangeLog --------- -## v0.11.8 at 2022-06-12 +## v0.12.1 (Positive Proxima) at 2022-08-24 + +The planned frontward release with new superior features on the day of 20 anniversary of [Positive Technologies](https://ptsecurty.com). + +``` +37 files changed, 7604 insertions(+), 7417 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +New: + + - The `Big Foot` feature which significantly reduces GC overhead for processing large lists of retired pages from huge transactions. 
+ Now _libmdbx_ avoid creating large chunks of PNLs (page number lists) which required a long sequences of free pages, aka large/overflow pages. + Thus avoiding searching, allocating and storing such sequences inside GC. + - Improved hot/online validation and checking of database pages both for more robustness and performance. + - New solid and fast method to latch meta-pages called `Troika`. + The minimum of memory barriers, reads, comparisons and conditional transitions are used. + - New `MDBX_VALIDATION` environment options to extra validation of DB structure and pages content for carefully/safe handling damaged or untrusted DB. + - Accelerated ×16/×8/×4 by AVX512/AVX2/SSE2/Neon implementations of search page sequences. + - Added the `gcrtime_seconds16dot16` counter to the "Page Operation Statistics" that accumulates time spent for GC searching and reclaiming. + - Copy-with-compactification now clears/zeroes unused gaps inside database pages. + - The `C` and `C++` APIs has been extended and/or refined to simplify using `wchar_t` pathnames. + On Windows the `mdbx_env_openW()`, ``mdbx_env_get_pathW()`()`, `mdbx_env_copyW()`, `mdbx_env_open_for_recoveryW()` are available for now, + but the `mdbx_env_get_path()` has been replaced in favor of `mdbx_env_get_pathW()`. + - Added explicit error message for Buildroot's Microblaze toolchain maintainers. + - Added `MDBX_MANAGE_BUILD_FLAGS` build options for CMake. + - Speed-up internal `bsearch`/`lower_bound` implementation using branchless tactic, including workaround for CLANG x86 optimiser bug. + - A lot internal refinement and micro-optimisations. + - Internally counted volume of dirty pages (unused for now but for coming features). + +Fixes: + + - Never use modern `__cxa_thread_atexit()` on Apple's OSes. + - Don't check owner for finished transactions. + - Fixed typo in `MDBX_EINVAL` which breaks MingGW builds with CLANG. + + +## v0.12.0 at 2022-06-19 + +Not a release but preparation for changing feature set and API. + + +------------------------------------------------------------------------------- + + +## v0.11.10 (the TriColor) at 2022-08-22 + +The stable bugfix release. +It is planned that this will be the last release of the v0.11 branch. + +``` +14 files changed, 263 insertions(+), 252 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +New: + + - The C++ API has been refined to simplify support for `wchar_t` in path names. + - Added explicit error message for Buildroot's Microblaze toolchain maintainers. + +Fixes: + + - Never use modern `__cxa_thread_atexit()` on Apple's OSes. + - Use `MultiByteToWideChar(CP_THREAD_ACP)` instead of `mbstowcs()`. + - Don't check owner for finished transactions. + - Fixed typo in `MDBX_EINVAL` which breaks MingGW builds with CLANG. + +Minors: + + - Fixed variable name typo. + - Using `ldd` to check used dso. + - Added `MDBX_WEAK_IMPORT_ATTRIBUTE` macro. + - Use current transaction geometry for untouched parameters when `env_set_geometry()` called within a write transaction. + - Minor clarified `iov_page()` failure case. + +------------------------------------------------------------------------------- + + +## v0.11.9 (Чирчик-1992) at 2022-08-02 + +The stable bugfix release. + +``` +18 files changed, 318 insertions(+), 178 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Acknowledgements: + + - [Alex Sharov](https://github.com/AskAlexSharov) and Erigon team for reporting and testing. + - [Andrew Ashikhmin](https://gitflic.ru/user/yperbasis) for contributing. 
+ +New: + + - Ability to customise `MDBX_LOCK_SUFFIX`, `MDBX_DATANAME`, `MDBX_LOCKNAME` just by predefine ones during build. + - Added to [`mdbx::env_managed`](https://libmdbx.dqdkfa.ru/group__cxx__api.html#classmdbx_1_1env__managed)'s methods a few overloads with `const char* pathname` parameter (C++ API). + +Fixes: + + - Fixed hang copy-with-compactification of a corrupted DB + or in case the volume of output pages is a multiple of `MDBX_ENVCOPY_WRITEBUF`. + - Fixed standalone non-CMake build on MacOS (`#include AvailabilityMacros.h>`). + - Fixed unexpected `MDBX_PAGE_FULL` error in rare cases with large database page sizes. + +Minors: + + - Minor fixes Doxygen references, comments, descriptions, etc. + - Fixed copy&paste typo inside `meta_checktxnid()`. + - Minor fix `meta_checktxnid()` to avoid assertion in debug mode. + - Minor fix `mdbx_env_set_geometry()` to avoid returning `EINVAL` in particular rare cases. + - Minor refine/fix batch-get testcase for large page size. + - Added `--pagesize NN` option to long-stotastic test script. + - Updated Valgrind-suppressions file for modern GCC. + - Fixed `has no symbols` warning from Apple's ranlib. + + +------------------------------------------------------------------------------- + + +## v0.11.8 (Baked Apple) at 2022-06-12 + +The stable release with an important fixes and workaround for the critical macOS thread-local-storage issue. Acknowledgements: @@ -25,6 +146,7 @@ Fixes: - Fixed `mdbx_check_fs_local()` for CDROM case on Windows. - Fixed nasty typo of typename which caused false `MDBX_CORRUPTED` error in a rare execution path, when the size of the thread-ID type not equal to 8. + - Fixed Elbrus/E2K LCC 1.26 compiler warnings (memory model for atomic operations, etc). - Fixed write-after-free memory corruption on latest `macOS` during finalization/cleanup of thread(s) that executed read transaction(s). > The issue was suddenly discovered by a [CI](https://en.wikipedia.org/wiki/Continuous_integration) > after adding an iteration with macOS 11 "Big Sur", and then reproduced on recent release of macOS 12 "Monterey". @@ -36,7 +158,6 @@ Fixes: > This is unexpected crazy-like behavior since the order of resources releasing/destroying > is not the reverse of ones acquiring/construction order. Nonetheless such surprise > is now workarounded by using atomic compare-and-swap operations on a 64-bit signatures/cookies. - - Fixed Elbrus/E2K LCC 1.26 compiler warnings (memory model for atomic operations, etc). Minors: @@ -51,7 +172,8 @@ Minors: ------------------------------------------------------------------------------- -## v0.11.7 at 2022-04-22 + +## v0.11.7 (Resurrected Sarmat) at 2022-04-22 The stable risen release after the Github's intentional malicious disaster. 
diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile b/crates/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile index d6b9fc7a7..e619d78bf 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile @@ -53,8 +53,9 @@ CFLAGS_EXTRA ?= LD ?= ld # build options -MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 +MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 MDBX_BUILD_TIMESTAMP ?=$(shell date +%Y-%m-%dT%H:%M:%S%z) +MDBX_BUILD_CXX ?= YES # probe and compose common compiler flags with variable expansion trick (seems this work two times per session for GNU Make 3.81) CFLAGS ?= $(strip $(eval CFLAGS := -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $$(shell for opt in -fno-semantic-interposition -Wno-unused-command-line-argument -Wno-tautological-compare; do [ -z "$$$$($(CC) '-DMDBX_BUILD_FLAGS="probe"' $$$${opt} -c $(SRC_PROBE_C) -o /dev/null >/dev/null 2>&1 || echo failed)" ] && echo "$$$${opt} "; done)$(CFLAGS_EXTRA))$(CFLAGS)) @@ -127,6 +128,9 @@ TIP := // TIP: .PHONY: all help options lib libs tools clean install uninstall check_buildflags_tag tools-static .PHONY: install-strip install-no-strip strip libmdbx mdbx show-options lib-static lib-shared +boolean = $(if $(findstring $(strip $($1)),YES Yes yes y ON On on 1 true True TRUE),1,$(if $(findstring $(strip $($1)),NO No no n OFF Off off 0 false False FALSE),,$(error Wrong value `$($1)` of $1 for YES/NO option))) +select_by = $(if $(call boolean,$(1)),$(2),$(3)) + ifeq ("$(origin V)", "command line") MDBX_BUILD_VERBOSE := $(V) endif @@ -134,7 +138,7 @@ ifndef MDBX_BUILD_VERBOSE MDBX_BUILD_VERBOSE := 0 endif -ifeq ($(MDBX_BUILD_VERBOSE),1) +ifeq ($(call boolean,MDBX_BUILD_VERBOSE),1) QUIET := HUSH := $(info $(TIP) Use `make V=0` for quiet.) @@ -169,12 +173,12 @@ help: show-options: @echo " MDBX_BUILD_OPTIONS = $(MDBX_BUILD_OPTIONS)" + @echo " MDBX_BUILD_CXX = $(MDBX_BUILD_CXX)" @echo " MDBX_BUILD_TIMESTAMP = $(MDBX_BUILD_TIMESTAMP)" @echo '$(TIP) Use `make options` to listing available build options.' - @echo " CC =`which $(CC)` | `$(CC) --version | head -1`" - @echo " CFLAGS =$(CFLAGS)" - @echo " CXXFLAGS =$(CXXFLAGS)" - @echo " LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)" + @echo $(call select_by,MDBX_BUILD_CXX," CXX =`which $(CXX)` | `$(CXX) --version | head -1`"," CC =`which $(CC)` | `$(CC) --version | head -1`") + @echo $(call select_by,MDBX_BUILD_CXX," CXXFLAGS =$(CXXFLAGS)"," CFLAGS =$(CFLAGS)") + @echo $(call select_by,MDBX_BUILD_CXX," LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)"," LDFLAGS =$(LDFLAGS) $(LIBS) $(EXE_LDFLAGS)") @echo '$(TIP) Use `make help` to listing available targets.' options: @@ -221,7 +225,7 @@ clean: config.h src/config.h src/version.c *.tar* buildflags.tag \ mdbx_*.static mdbx_*.static-lto -MDBX_BUILD_FLAGS =$(strip $(MDBX_BUILD_OPTIONS) $(CXXSTD) $(CFLAGS) $(LDFLAGS) $(LIBS)) +MDBX_BUILD_FLAGS =$(strip MDBX_BUILD_CXX=$(MDBX_BUILD_CXX) $(MDBX_BUILD_OPTIONS) $(call select_by,MDBX_BUILD_CXX,$(CXXFLAGS) $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS),$(CFLAGS) $(LDFLAGS) $(LIBS))) check_buildflags_tag: $(QUIET)if [ "$(MDBX_BUILD_FLAGS)" != "$$(cat buildflags.tag 2>&1)" ]; then \ echo -n " CLEAN for build with specified flags..." && \ @@ -231,13 +235,13 @@ check_buildflags_tag: buildflags.tag: check_buildflags_tag -lib-static libmdbx.a: mdbx-static.o mdbx++-static.o +lib-static libmdbx.a: mdbx-static.o $(call select_by,MDBX_BUILD_CXX,mdbx++-static.o) @echo ' AR $@' $(QUIET)$(AR) rcs $@ $? 
$(HUSH) -lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o mdbx++-dylib.o +lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o $(call select_by,MDBX_BUILD_CXX,mdbx++-dylib.o) @echo ' LD $@' - $(QUIET)$(CXX) $(CXXFLAGS) $^ -pthread -shared $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) -o $@ + $(QUIET)$(call select_by,MDBX_BUILD_CXX,$(CXX) $(CXXFLAGS),$(CC) $(CFLAGS)) $^ -pthread -shared $(LDFLAGS) $(call select_by,MDBX_BUILD_CXX,$(LIB_STDCXXFS)) $(LIBS) -o $@ ################################################################################ @@ -340,13 +344,13 @@ else define bench-rule bench-$(1)_$(2).txt: $(3) $(IOARENA) $(lastword $(MAKEFILE_LIST)) @echo ' RUNNING ioarena for $1/$2...' - $(QUIET)LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ + $(QUIET)(export LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}"; \ + ldd $(IOARENA) && \ $(IOARENA) -D $(1) -B crud -m $(BENCH_CRUD_MODE) -n $(2) \ - | tee $$@ | grep throughput && \ - LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ - $(IOARENA) -D $(1) -B get,iterate -m $(BENCH_CRUD_MODE) -r 4 -n $(2) \ - | tee -a $$@ | grep throughput \ - || mv -f $$@ $$@.error + | tee $$@ | grep throughput && \ + $(IOARENA) -D $(1) -B iterate,get,iterate,get,iterate -m $(BENCH_CRUD_MODE) -r 4 -n $(2) \ + | tee -a $$@ | grep throughput \ + ) || mv -f $$@ $$@.error endef diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/README.md b/crates/libmdbx-rs/mdbx-sys/libmdbx/README.md index 1ac865233..3f78a26be 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/README.md +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/README.md @@ -97,6 +97,24 @@ _MithrilDB_ is a rightly relevant name. +``` +$ objdump -f -h -j .text libmdbx.so + + libmdbx.so: формат файла elf64-e2k + архитектура: elbrus-v6:64, флаги 0x00000150: + HAS_SYMS, DYNAMIC, D_PAGED + начальный адрес 0x0000000000021680 + + Разделы: + Idx Name Разм VMA LMA Фа смещ. Выр. + 10 .text 000ddd28 0000000000021680 0000000000021680 00021680 2**3 + CONTENTS, ALLOC, LOAD, READONLY, CODE + +$ cc --version + lcc:1.26.12:Jun-05-2022:e2k-v6-linux + gcc (GCC) 9.3.0 compatible +``` + ----- ## Table of Contents @@ -376,20 +394,20 @@ since release the version 1.0. _libmdbx_ provides two official ways for integration in source code form: -1. Using the amalgamated source code. - > The amalgamated source code includes all files required to build and +1. Using an amalgamated source code which available in the [releases section](https://gitflic.ru/project/erthink/libmdbx/release) on GitFlic. + > An amalgamated source code includes all files required to build and > use _libmdbx_, but not for testing _libmdbx_ itself. + > Beside the releases an amalgamated sources could be created any time from the original clone of git + > repository on Linux by executing `make dist`. As a result, the desired + > set of files will be formed in the `dist` subdirectory. -2. Adding the complete original source code as a `git submodule`. +2. Adding the complete source code as a `git submodule` from the [origin git repository](https://gitflic.ru/project/erthink/libmdbx) on GitFlic. > This allows you to build as _libmdbx_ and testing tool. > On the other hand, this way requires you to pull git tags, and use C++11 compiler for test tool. _**Please, avoid using any other techniques.**_ Otherwise, at least don't ask for support and don't name such chimeras `libmdbx`. -The amalgamated source code could be created from the original clone of git -repository on Linux by executing `make dist`. As a result, the desired -set of files will be formed in the `dist` subdirectory. 
## Building and Testing diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt b/crates/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt index f3b6bd460..ba9b59b57 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt @@ -1 +1 @@ -0.11.8.0 +0.12.1.0 diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/config.h.in b/crates/libmdbx-rs/mdbx-sys/libmdbx/config.h.in index 7959699a3..58119c339 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/config.h.in +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/config.h.in @@ -26,7 +26,7 @@ #ifndef MDBX_TRUST_RTC_AUTO #cmakedefine01 MDBX_TRUST_RTC #endif -#cmakedefine01 MDBX_DISABLE_PAGECHECKS +#cmakedefine01 MDBX_DISABLE_VALIDATION /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 index c352c38fa..da2e78fb4 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ .\" Copyright 2015-2022 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_CHK 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 index 545126204..3cb97a343 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_COPY 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 index d8859a579..099c485b6 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ .\" Copyright 2021-2022 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_DROP 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 index 403b2faba..417488e7f 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_DUMP 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 index 01c58b011..4ab41fbfe 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_LOAD 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_LOAD 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 index f2ebbcf98..a47d52f01 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_STAT 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c index 9ad4381bb..890bccd68 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -288,11 +288,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -436,8 +437,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -505,18 +507,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || 
defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -678,6 +712,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -697,6 +733,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -741,6 +779,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -895,6 +956,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -958,7 +1029,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -978,7 +1049,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif 
defined(__ATOMIC_SEQ_CST) @@ -1016,8 +1087,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1025,8 +1096,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1035,36 +1106,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1076,23 +1147,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1110,7 +1184,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory 
manager uses, and is * fundamental to the use of memory-mapped files. */ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1120,7 +1194,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1133,7 +1213,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1141,13 +1221,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1157,12 +1237,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1173,8 +1253,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1184,15 +1264,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1201,57 +1281,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free 
+MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1260,25 
+1340,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1286,17 +1367,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1306,7 +1388,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1319,24 +1401,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int 
mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1352,7 +1433,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1373,7 +1454,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1381,14 +1462,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1401,14 +1482,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. 
/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1423,15 +1504,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1441,14 +1522,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1685,6 +1780,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. 
*/ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. */ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1694,11 +1801,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1957,14 +2064,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1973,6 +2077,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2041,8 +2149,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2098,15 +2206,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2141,26 +2249,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2268,7 +2376,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2287,11 +2398,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
@@ -2334,21 +2448,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2365,6 +2482,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2384,16 +2509,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2401,17 +2529,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2528,7 +2656,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2554,7 +2682,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2661,6 +2789,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2712,6 +2841,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2723,7 +2861,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2746,9 +2884,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2758,7 +2896,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2785,6 +2923,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2809,11 +2948,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2854,8 +2993,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2865,18 +3004,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2909,13 +3057,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2926,18 +3076,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -2986,7 +3136,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -2995,11 +3145,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3034,142 +3184,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3194,15 +3340,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3264,8 +3410,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3408,7 +3552,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3433,15 +3578,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) /* @@ -3667,19 +3812,19 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( static __always_inline uint64_t unaligned_peek_u64_volatile(const unsigned expected_alignment, - volatile const void *const __restrict ptr) { + const volatile void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); assert(expected_alignment % sizeof(uint32_t) == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) - return *(volatile const uint64_t *)ptr; + return *(const volatile uint64_t *)ptr; else { #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ defined(_M_X64) || defined(_M_IA64) - return *(volatile const __unaligned uint64_t *)ptr; + return *(const volatile 
__unaligned uint64_t *)ptr; #else - const uint32_t lo = ((volatile const uint32_t *) + const uint32_t lo = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint32_t hi = ((volatile const uint32_t *) + const uint32_t hi = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; return lo | (uint64_t)hi << 32; #endif /* _MSC_VER || __unaligned */ @@ -3939,8 +4084,8 @@ __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, } size_t mdbx_default_pagesize(void) { - size_t pagesize = mdbx_syspagesize(); - mdbx_ensure(nullptr, is_powerof2(pagesize)); + size_t pagesize = osal_syspagesize(); + ENSURE(nullptr, is_powerof2(pagesize)); pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE; pagesize = (pagesize <= MAX_PAGESIZE) ? pagesize : MAX_PAGESIZE; return pagesize; @@ -3981,7 +4126,7 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, /* Calculate the size of a leaf node. * * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node + * is too large it will be put onto an large/overflow page and the node * size will only include the key and not the data. Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment * of the MDBX_node headers. */ @@ -3989,7 +4134,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { size_t node_bytes = node_size(key, data); if (node_bytes > env->me_leaf_nodemax) { - /* put on overflow page */ + /* put on large/overflow page */ node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); } @@ -3999,7 +4144,7 @@ leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { /* Calculate the size of a branch node. * * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow + * we currently don't support spilling large keys onto large/overflow * pages, it's simply the size of the MDBX_node header plus the * size of the key. Sizes are always rounded up to an even number * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. @@ -4014,7 +4159,7 @@ branch_size(const MDBX_env *env, const MDBX_val *key) { * This is just the node header plus the key, there is no data. 
*/ size_t node_bytes = node_size(key, nullptr); if (unlikely(node_bytes > env->me_leaf_nodemax)) { - /* put on overflow page */ + /* put on large/overflow page */ /* not implemented */ mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, __LINE__); @@ -4047,7 +4192,7 @@ flags_db2sub(uint16_t db_flags) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { - mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); + eASSERT(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } @@ -4058,7 +4203,7 @@ pgno2page(const MDBX_env *env, pgno_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { - mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + eASSERT(env, (env->me_psize >> env->me_psize2log) == 1); return (pgno_t)(bytes >> env->me_psize2log); } @@ -4105,6 +4250,7 @@ page_room(const MDBX_page *mp) { return mp->mp_upper - mp->mp_lower; } +/* Maximum free space in an empty page */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned page_space(const MDBX_env *env) { STATIC_ASSERT(PAGEHDRSZ % 2 == 0); @@ -4122,59 +4268,80 @@ page_fill(const MDBX_env *env, const MDBX_page *mp) { return page_used(env, mp) * 100.0 / page_space(env); } -/* The number of overflow pages needed to store the given size. */ +/* The number of large/overflow pages needed to store the given size. */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t number_of_ovpages(const MDBX_env *env, size_t bytes) { return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; } -__cold static int MDBX_PRINTF_ARGS(2, 3) +__cold static const char *pagetype_caption(const uint8_t type, + char buf4unknown[16]) { + switch (type) { + case P_BRANCH: + return "branch"; + case P_LEAF: + return "leaf"; + case P_LEAF | P_SUBP: + return "subleaf"; + case P_LEAF | P_LEAF2: + return "dupfixed-leaf"; + case P_LEAF | P_LEAF2 | P_SUBP: + return "dupfixed-subleaf"; + case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY: + return "dupfixed-subleaf.legacy-dirty"; + case P_OVERFLOW: + return "large"; + default: + snprintf(buf4unknown, 16, "unknown_0x%x", type); + return buf4unknown; + } +} + +__cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) 
{ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) { + if (LOG_ENABLED(MDBX_LOG_ERROR)) { static const MDBX_page *prev; if (prev != mp) { + char buf4unknown[16]; prev = mp; - const char *type; - switch (mp->mp_flags & (P_BRANCH | P_LEAF | P_OVERFLOW | P_META | - P_LEAF2 | P_BAD | P_SUBP)) { - case P_BRANCH: - type = "branch"; - break; - case P_LEAF: - type = "leaf"; - break; - case P_LEAF | P_SUBP: - type = "subleaf"; - break; - case P_LEAF | P_LEAF2: - type = "dupfixed-leaf"; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - type = "dupfixed-subleaf"; - break; - case P_OVERFLOW: - type = "large"; - break; - default: - type = "broken"; - } - mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", type, - mp->mp_pgno, mp->mp_txnid); + debug_log(MDBX_LOG_ERROR, "badpage", 0, + "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); } va_list args; va_start(args, fmt); - mdbx_debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); + debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); va_end(args); } return MDBX_CORRUPTED; } +__cold static void MDBX_PRINTF_ARGS(2, 3) + poor_page(const MDBX_page *mp, const char *fmt, ...) { + if (LOG_ENABLED(MDBX_LOG_NOTICE)) { + static const MDBX_page *prev; + if (prev != mp) { + char buf4unknown[16]; + prev = mp; + debug_log(MDBX_LOG_NOTICE, "poorpage", 0, + "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); + } + + va_list args; + va_start(args, fmt); + debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); + va_end(args); + } +} + /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * page_node(const MDBX_page *mp, unsigned i) { - assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0); + assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); assert(page_numkeys(mp) > (unsigned)(i)); assert(mp->mp_ptrs[i] % 2 == 0); return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); @@ -4185,8 +4352,7 @@ page_node(const MDBX_page *mp, unsigned i) { * There are no node headers, keys are stored contiguously. */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { - assert((mp->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_META)) == - (P_LEAF | P_LEAF2)); + assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); @@ -4205,120 +4371,6 @@ get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { get_key(node, keyptr); } -/*------------------------------------------------------------------------------ - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. 
*/ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - if (unlikely(n > 42 - /* LY: align followed access if reasonable possible */ - && (((uintptr_t)s1) & 7) != 0 && - (((uintptr_t)s1) & 7) == (((uintptr_t)s2) & 7))) { - if (((uintptr_t)s1) & 1) { - const int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (diff) - return diff; - s1 = (char *)s1 + 1; - s2 = (char *)s2 + 1; - n -= 1; - } - - if (((uintptr_t)s1) & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - n -= 2; - } - - if (((uintptr_t)s1) & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - n -= 4; - } - } - - while (n >= 8) { - const uint64_t a = *(uint64_t *)s1; - const uint64_t b = *(uint64_t *)s2; - if (likely(a != b)) - return (__builtin_bswap64(a) > __builtin_bswap64(b)) ? 1 : -1; - s1 = (char *)s1 + 8; - s2 = (char *)s2 + 8; - n -= 8; - } - - if (n & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - } - - if (n & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - } - - return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0; -} - -int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - while (true) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - } -} - -int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - while (n > 0) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - n -= 1; - } - return 0; -} - -size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) { - size_t n = 0; - while (*s) { - s += 1; - n += 1; - } - return n; -} - -size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - size_t n = 0; - while (maxlen > n && *s) { - s += 1; - n += 1; - } - return n; -} -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - /*------------------------------------------------------------------------------ * safe read/write volatile 64-bit fields on 32-bit architectures. 
*/ @@ -4333,16 +4385,16 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); - mdbx_jitter4testing(true); + jitter4testing(true); atomic_store32(&p->high, (uint32_t)(value >> 32), order); - mdbx_jitter4testing(true); + jitter4testing(true); #endif /* !MDBX_64BIT_ATOMIC */ return value; } @@ -4354,7 +4406,7 @@ MDBX_MAYBE_UNUSED static __always_inline #endif /* MDBX_64BIT_ATOMIC */ uint64_t - atomic_load64(const MDBX_atomic_uint64_t *p, + atomic_load64(const volatile MDBX_atomic_uint64_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); #if MDBX_64BIT_ATOMIC @@ -4362,26 +4414,26 @@ MDBX_MAYBE_UNUSED static assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint64_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); for (;;) { - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); if (likely(value == again)) return value; value = again; @@ -4511,40 +4563,34 @@ static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { return txnid; } -#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) -#define safe64_reset(p, single_writer) \ - atomic_store64(p, UINT64_MAX, \ - (single_writer) ? 
mo_AcquireRelease \ - : mo_SequentialConsistency) -#else +/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */ static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, bool single_writer) { -#if !MDBX_64BIT_CAS - if (!single_writer) { - STATIC_ASSERT(xMDBX_TXNID_STEP > 1); + if (single_writer) { +#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#else + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */ + } else { +#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#elif MDBX_64BIT_CAS + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#else /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 * and overflow was preserved in safe64_txnid_next() */ + STATIC_ASSERT(xMDBX_TXNID_STEP > 1); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - atomic_store32( - &p->high, UINT32_MAX, - mo_Relaxed) /* atomically make >= SAFE64_INVALID_THRESHOLD */; + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - } else -#endif /* !MDBX_64BIT_CAS */ -#if MDBX_64BIT_ATOMIC - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ - atomic_store64(p, UINT64_MAX, - single_writer ? mo_AcquireRelease - : mo_SequentialConsistency); -#else - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ - atomic_store32(&p->high, UINT32_MAX, - single_writer ? mo_AcquireRelease : mo_SequentialConsistency); -#endif /* MDBX_64BIT_ATOMIC */ +#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ + } assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); } -#endif /* LCC && MDBX_HAVE_C11ATOMICS */ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, txnid_t compare) { @@ -4570,32 +4616,34 @@ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, rc = true; } #endif /* MDBX_64BIT_CAS */ - mdbx_jitter4testing(true); + jitter4testing(true); return rc; } static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(p->weak >= SAFE64_INVALID_THRESHOLD); -#if MDBX_64BIT_ATOMIC +#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS atomic_store64(p, v, mo_AcquireRelease); #else /* MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); #endif /* MDBX_64BIT_ATOMIC */ assert(p->weak == v); - mdbx_jitter4testing(true); + jitter4testing(true); } static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { - mdbx_jitter4testing(true); - uint64_t v = atomic_load64(p, mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); + uint64_t v; + do + v = atomic_load64(p, mo_AcquireRelease); + while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); return v; } @@ -4637,7 +4685,7 @@ MDBX_MAYBE_UNUSED static void safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(v > 0); - 
safe64_update(p, atomic_load64(p, mo_Relaxed) + v); + safe64_update(p, safe64_read(p) + v); } /*----------------------------------------------------------------------------*/ @@ -4646,7 +4694,7 @@ MDBX_MAYBE_UNUSED static typedef struct rthc_entry_t { MDBX_reader *begin; MDBX_reader *end; - mdbx_thread_key_t thr_tls_key; + osal_thread_key_t thr_tls_key; } rthc_entry_t; #if MDBX_DEBUG @@ -4665,11 +4713,11 @@ static CRITICAL_SECTION lcklist_critical_section; static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; -static mdbx_thread_key_t rthc_key; +static osal_thread_key_t rthc_key; static MDBX_atomic_uint32_t rthc_pending; static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { - uint64_t salt = mdbx_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ + uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return salt << 8 | kind; @@ -4713,55 +4761,50 @@ rthc_compare_and_clean(const void *rthc, const uint64_t signature) { static __inline int rthc_atexit(void (*dtor)(void *), void *obj, void *dso_symbol) { - int rc = MDBX_ENOSYS; +#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL +#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ + defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ + defined(ANDROID) +#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 +#else +#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 +#endif +#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */ -#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) -#if !defined(MAC_OS_X_VERSION_MIN_REQUIRED) || !defined(MAC_OS_X_VERSION_10_7) -#error \ - "The should be included and MAC_OS_X_VERSION_MIN_REQUIRED must be defined" -#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 +#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT +#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) || \ + defined(HAVE___CXA_THREAD_ATEXIT) +#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 +#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && \ + (defined(__linux__) || defined(__gnu_linux__)) +#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 +#else +#define MDBX_HAVE_CXA_THREAD_ATEXIT 0 +#endif +#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */ + + int rc = MDBX_ENOSYS; +#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT +#define __cxa_thread_atexit __cxa_thread_atexit_impl +#endif +#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit) + extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj, + void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE; + if (&__cxa_thread_atexit) + rc = __cxa_thread_atexit(dtor, obj, dso_symbol); +#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE) extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr) - __attribute__((__weak__, __weak_import__)); - if (rc && &_tlv_atexit) { + MDBX_WEAK_IMPORT_ATTRIBUTE; + if (&_tlv_atexit) { (void)dso_symbol; _tlv_atexit(dtor, obj); rc = 0; } -#elif !defined(MDBX_HAVE_CXA_THREAD_ATEXIT) -#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 -#endif /* MAC_OS_X_VERSION_MIN_REQUIRED */ -#endif /* Apple */ - -#if defined(MDBX_HAVE_CXA_THREAD_ATEXIT) && MDBX_HAVE_CXA_THREAD_ATEXIT - extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj, - void *dso_symbol) -#ifdef WEAK_IMPORT_ATTRIBUTE - WEAK_IMPORT_ATTRIBUTE -#elif defined(MAC_OS_X_VERSION_MIN_REQUIRED) && \ - MAC_OS_X_VERSION_MIN_REQUIRED >= 1020 && \ - ((__has_attribute(__weak__) && 
__has_attribute(__weak_import__)) || \ - (defined(__GNUC__) && __GNUC__ >= 4)) - __attribute__((__weak__, __weak_import__)) -#elif (__has_attribute(__weak__) || (defined(__GNUC__) && __GNUC__ >= 4)) && \ - !defined(MAC_OS_X_VERSION_MIN_REQUIRED) - __attribute__((__weak__)) -#endif - ; - if (rc && &__cxa_thread_atexit) - rc = __cxa_thread_atexit(dtor, obj, dso_symbol); -#elif __GLIBC_PREREQ(2, 18) || defined(ANDROID) || defined(__linux__) || \ - defined(__gnu_linux__) - extern int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, - void *dso_symbol) - __attribute__((__weak__)); - if (rc && &__cxa_thread_atexit_impl) - rc = __cxa_thread_atexit_impl(dtor, obj, dso_symbol); #else (void)dtor; (void)obj; (void)dso_symbol; #endif - return rc; } @@ -4789,7 +4832,7 @@ static __inline void rthc_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&rthc_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0); #endif } @@ -4797,11 +4840,11 @@ static __inline void rthc_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); #endif } -static __inline int thread_key_create(mdbx_thread_key_t *key) { +static __inline int thread_key_create(osal_thread_key_t *key) { int rc; #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); @@ -4809,22 +4852,22 @@ static __inline int thread_key_create(mdbx_thread_key_t *key) { #else rc = pthread_key_create(key, nullptr); #endif - mdbx_trace("&key = %p, value %" PRIuPTR ", rc %d", - __Wpedantic_format_voidptr(key), (uintptr_t)*key, rc); + TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key), + (uintptr_t)*key, rc); return rc; } -static __inline void thread_key_delete(mdbx_thread_key_t key) { - mdbx_trace("key = %" PRIuPTR, (uintptr_t)key); +static __inline void thread_key_delete(osal_thread_key_t key) { + TRACE("key = %" PRIuPTR, (uintptr_t)key); #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsFree(key)); + ENSURE(nullptr, TlsFree(key)); #else - mdbx_ensure(nullptr, pthread_key_delete(key) == 0); + ENSURE(nullptr, pthread_key_delete(key) == 0); workaround_glibc_bug21031(); #endif } -static __inline void *thread_rthc_get(mdbx_thread_key_t key) { +static __inline void *thread_rthc_get(osal_thread_key_t key) { #if defined(_WIN32) || defined(_WIN64) return TlsGetValue(key); #else @@ -4832,9 +4875,9 @@ static __inline void *thread_rthc_get(mdbx_thread_key_t key) { #endif } -static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { +static void thread_rthc_set(osal_thread_key_t key, const void *value) { #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsSetValue(key, (void *)value)); + ENSURE(nullptr, TlsSetValue(key, (void *)value)); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state); @@ -4842,102 +4885,59 @@ static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { if (value && unlikely(rthc_thread_state != sign_registered && rthc_thread_state != sign_counted)) { rthc_thread_state = sign_registered; - mdbx_trace("thread registered 0x%" PRIxPTR, mdbx_thread_self()); - if (rthc_atexit(mdbx_rthc_thread_dtor, &rthc_thread_state, + TRACE("thread registered 0x%" PRIxPTR, osal_thread_self()); + if (rthc_atexit(thread_dtor, &rthc_thread_state, (void *)&mdbx_version /* 
dso_anchor */)) { - mdbx_ensure(nullptr, - pthread_setspecific(rthc_key, &rthc_thread_state) == 0); + ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0); rthc_thread_state = sign_counted; const unsigned count_before = atomic_add32(&rthc_pending, 1); - mdbx_ensure(nullptr, count_before < INT_MAX); - mdbx_notice("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", - (uintptr_t)rthc_key, count_before); + ENSURE(nullptr, count_before < INT_MAX); + NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", + (uintptr_t)rthc_key, count_before); (void)count_before; } } - mdbx_ensure(nullptr, pthread_setspecific(key, value) == 0); -#endif -} - -__cold void mdbx_rthc_global_init(void) { - rthc_limit = RTHC_INITIAL_LIMIT; - rthc_table = rthc_table_static; -#if defined(_WIN32) || defined(_WIN64) - InitializeCriticalSection(&rthc_critical_section); - InitializeCriticalSection(&lcklist_critical_section); -#else - mdbx_ensure(nullptr, - pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0); - mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(), - __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); -#endif - /* checking time conversion, this also avoids racing on 32-bit architectures - * during writing calculated 64-bit ratio(s) into memory. */ - uint32_t proba = UINT32_MAX; - while (true) { - unsigned time_conversion_checkup = - mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba)); - unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; - unsigned one_less = (proba > 0) ? proba - 1 : proba; - mdbx_ensure(nullptr, time_conversion_checkup >= one_less && - time_conversion_checkup <= one_more); - if (proba == 0) - break; - proba >>= 1; - } - - bootid = mdbx_osal_bootid(); -#if 0 /* debug */ - for (unsigned i = 0; i < 65536; ++i) { - size_t pages = pv2pages(i); - unsigned x = pages2pv(pages); - size_t xp = pv2pages(x); - if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) - printf("%u => %zu => %u => %zu\n", i, pages, x, xp); - assert(pages == xp); - } - fflush(stdout); + ENSURE(nullptr, pthread_setspecific(key, value) == 0); #endif } /* dtor called for thread, i.e. for all mdbx's environment objects */ -__cold void mdbx_rthc_thread_dtor(void *rthc) { +__cold void thread_dtor(void *rthc) { rthc_lock(); - mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(), - mdbx_thread_self(), rthc); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), + osal_thread_self(), rthc); - const uint32_t self_pid = mdbx_getpid(); + const uint32_t self_pid = osal_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const osal_thread_key_t key = rthc_table[i].thr_tls_key; MDBX_reader *const reader = thread_rthc_get(key); if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) continue; #if !defined(_WIN32) && !defined(_WIN64) if (pthread_setspecific(key, nullptr) != 0) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p: ignore race with tsd-key deletion", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p: ignore race with tsd-key deletion", + osal_thread_self(), __Wpedantic_format_voidptr(reader)); continue /* ignore race with tsd-key deletion by mdbx_env_close() */; } #endif - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, [%i], %p ... 
%p (%+i), rtch-pid %i, " - "current-pid %i", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, - self_pid); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " + "current-pid %i", + osal_thread_self(), __Wpedantic_format_voidptr(reader), i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { - mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); + TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), + __Wpedantic_format_voidptr(reader)); atomic_cas32(&reader->mr_pid, self_pid, 0); } } #if defined(_WIN32) || defined(_WIN64) - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); rthc_unlock(); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); @@ -4945,28 +4945,28 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "counted", state); - mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "wrong", state); } if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { - mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", - mdbx_thread_self(), rthc, mdbx_getpid()); - mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), + rthc, osal_getpid()); + ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); } - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); /* Allow tail call optimization, i.e. 
gcc should generate the jmp instruction * instead of a call for pthread_mutex_unlock() and therefore CPU could not * return to current DSO's code section, which may be unloaded immediately @@ -4975,44 +4975,45 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { #endif } -__cold void mdbx_rthc_global_dtor(void) { - mdbx_trace(">> pid %d", mdbx_getpid()); +MDBX_EXCLUDE_FOR_GPROF +__cold void global_dtor(void) { + TRACE(">> pid %d", osal_getpid()); rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) uint64_t *rthc = pthread_getspecific(rthc_key); - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status 0x%08" PRIx64 ", left %d", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), rthc ? rthc_read(rthc) : ~UINT64_C(0), - atomic_load32(&rthc_pending, mo_Relaxed)); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 + ", left %d", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + rthc ? rthc_read(rthc) : ~UINT64_C(0), + atomic_load32(&rthc_pending, mo_Relaxed)); if (rthc) { const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "counted", state); - mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), + osal_getpid(), "wrong", state); } } struct timespec abstime; - mdbx_ensure(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); + ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); abstime.tv_nsec += 1000000000l / 10; if (abstime.tv_nsec >= 1000000000l) { abstime.tv_nsec -= 1000000000l; @@ -5024,8 +5025,7 @@ __cold void mdbx_rthc_global_dtor(void) { for (unsigned left; (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - mdbx_notice("tls-cleanup: pid %d, pending %u, wait for...", mdbx_getpid(), - left); + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; @@ -5033,29 +5033,28 @@ __cold void mdbx_rthc_global_dtor(void) { thread_key_delete(rthc_key); #endif - const uint32_t self_pid = mdbx_getpid(); + const uint32_t self_pid = osal_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const 
mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const osal_thread_key_t key = rthc_table[i].thr_tls_key; thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - mdbx_trace( - "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " - "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), - rthc->mr_pid.weak, self_pid); + TRACE("== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " + "rthc-pid %i, current-pid %i", + i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), + rthc->mr_pid.weak, self_pid); if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } } rthc_limit = rthc_count = 0; if (rthc_table != rthc_table_static) - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = nullptr; rthc_unlock(); @@ -5068,22 +5067,22 @@ __cold void mdbx_rthc_global_dtor(void) { workaround_glibc_bug21031(); #endif - mdbx_trace("<< pid %d\n", mdbx_getpid()); + TRACE("<< pid %d\n", osal_getpid()); } -__cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, - MDBX_reader *end) { +__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, + MDBX_reader *end) { assert(pkey != NULL); #ifndef NDEBUG - *pkey = (mdbx_thread_key_t)0xBADBADBAD; + *pkey = (osal_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ rthc_lock(); - mdbx_trace(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); + TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); int rc; if (rthc_count == rthc_limit) { rthc_entry_t *new_table = - mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, + osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); if (new_table == nullptr) { rc = MDBX_ENOMEM; @@ -5100,15 +5099,14 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, goto bailout; *pkey = rthc_table[rthc_count].thr_tls_key; - mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, - (uintptr_t)*pkey, __Wpedantic_format_voidptr(begin), - __Wpedantic_format_voidptr(end)); + TRACE("== [%i] = key %" PRIuPTR ", %p ... 
%p", rthc_count, (uintptr_t)*pkey, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); rthc_table[rthc_count].begin = begin; rthc_table[rthc_count].end = end; ++rthc_count; - mdbx_trace("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", - (uintptr_t)*pkey, rthc_count, rthc_limit); + TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, + rthc_count, rthc_limit); rthc_unlock(); return MDBX_SUCCESS; @@ -5117,30 +5115,30 @@ bailout: return rc; } -__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { +__cold void rthc_remove(const osal_thread_key_t key) { thread_key_delete(key); rthc_lock(); - mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, - rthc_count, rthc_limit); + TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, + rthc_limit); for (unsigned i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].thr_tls_key) { - const uint32_t self_pid = mdbx_getpid(); - mdbx_trace("== [%i], %p ...%p, current-pid %d", i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); + const uint32_t self_pid = osal_getpid(); + TRACE("== [%i], %p ...%p, current-pid %d", i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != rthc_table_static) { - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = rthc_table_static; rthc_limit = RTHC_INITIAL_LIMIT; } @@ -5148,8 +5146,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { } } - mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, - rthc_count, rthc_limit); + TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, + rthc_limit); rthc_unlock(); } @@ -5162,7 +5160,7 @@ static __inline void lcklist_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&lcklist_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); #endif } @@ -5170,7 +5168,7 @@ static __inline void lcklist_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); #endif } @@ -5183,7 +5181,7 @@ MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { return v ^ v >> 28; } -static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { +static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { int rc; uint64_t bait; MDBX_lockinfo *const pending_lck = pending->lck; @@ -5193,42 +5191,40 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { rc = MDBX_SUCCESS; } else { bait = 0 /* hush MSVC warning */; - rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); + rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); if (rc == MDBX_SUCCESS) - rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), + rc = osal_pread(pending->fd, &bait, 
sizeof(scan_lck->mti_bait_uniqueness), offsetof(MDBX_lockinfo, mti_bait_uniqueness)); } if (likely(rc == MDBX_SUCCESS) && bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) rc = MDBX_RESULT_TRUE; - mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", - pending_lck ? "mem" : "file", bait, - (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending_lck ? "mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); return rc; } -static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, +static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, uint64_t *abra) { if (*abra == 0) { - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); uintptr_t uit = 0; memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); - *abra = - rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); + *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); } const uint64_t cadabra = - rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) << 24 | *abra >> 40; MDBX_lockinfo *const scan_lck = scan->lck; - atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, - mo_SequentialConsistency); + atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease); *abra = *abra * UINT64_C(6364136223846793005) + 1; return uniq_peek(pending, scan); } -__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { +__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { *found = nullptr; uint64_t salt = 0; for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; @@ -5239,33 +5235,33 @@ __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { uint64_t length; - if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS && + if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ - mdbx_debug("uniq-probe: %s", "unique (new/empty lck)"); + DEBUG("uniq-probe: %s", "unique (new/empty lck)"); return MDBX_RESULT_TRUE; } } if (err == MDBX_RESULT_TRUE) err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { - (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), + (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_NONE); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); *found = scan; - mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); + DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); return MDBX_RESULT_FALSE; } if (unlikely(err != MDBX_SUCCESS)) { - mdbx_debug("uniq-probe: failed rc %d", err); + DEBUG("uniq-probe: failed rc %d", err); return err; } } - mdbx_debug("uniq-probe: %s", "unique"); + DEBUG("uniq-probe: %s", "unique"); return MDBX_RESULT_TRUE; } @@ -5273,8 +5269,8 @@ static int lcklist_detach_locked(MDBX_env *env) { MDBX_env *inprocess_neighbor = nullptr; int rc = MDBX_SUCCESS; if (env->me_lcklist_next != nullptr) { - mdbx_ensure(env, env->me_lcklist_next != nullptr); - mdbx_ensure(env, inprocess_lcklist_head != RTHC_ENVLIST_END); + ENSURE(env, env->me_lcklist_next != nullptr); + ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; ptr = &(*ptr)->me_lcklist_next) { if (*ptr == env) { @@ -5283,16 +5279,16 @@ static int lcklist_detach_locked(MDBX_env *env) { break; } } - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); } - rc = likely(mdbx_getpid() == env->me_pid) + rc = likely(osal_getpid() == env->me_pid) ? uniq_check(&env->me_lck_mmap, &inprocess_neighbor) : MDBX_PANIC; if (!inprocess_neighbor && env->me_live_reader) - (void)mdbx_rpid_clear(env); + (void)osal_rpid_clear(env); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_lck_destroy(env, inprocess_neighbor); + rc = osal_lck_destroy(env, inprocess_neighbor); return rc; } @@ -5301,13 +5297,24 @@ static int lcklist_detach_locked(MDBX_env *env) { * and network-sort for small chunks. * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ +#if MDBX_HAVE_CMOV #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ do { \ const TYPE swap_tmp = (a); \ - const bool swap_cmp = CMP(swap_tmp, b); \ + const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ (a) = swap_cmp ? swap_tmp : b; \ (b) = swap_cmp ? 
b : swap_tmp; \ } while (0) +#else +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do \ + if (expect_with_probability(!CMP(a, b), 0, .5)) { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } \ + while (0) +#endif // 3 comparators, 3 parallel operations // o-----^--^--o @@ -5498,674 +5505,10 @@ static int lcklist_detach_locked(MDBX_env *env) { SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ } while (0) -// 25 comparators, 9 parallel operations -// o--^-----^--^-----^-----------------------------------o -// | | | | -// o--v--^--v--|-----|--^-----^-----------^--------------o -// | | | | | | -// o-----v-----|-----|--|-----|--^-----^--|--^-----^--^--o -// | | | | | | | | | | -// o--^-----^--v--^--v--|-----|--|-----|--v--|-----|--v--o -// | | | | | | | | | -// o--v--^--v-----|-----v--^--v--|-----|-----|--^--v-----o -// | | | | | | | -// o-----v--------|--------|-----v--^--v--^--|--|--^-----o -// | | | | | | | -// o--^-----^-----v--------|--------|-----|--v--v--v-----o -// | | | | | -// o--v--^--v--------------v--------|-----v--------------o -// | | -// o-----v--------------------------v--------------------o -// -// [[0,1],[3,4],[6,7]] -// [[1,2],[4,5],[7,8]] -// [[0,1],[3,4],[6,7],[2,5]] -// [[0,3],[1,4],[5,8]] -// [[3,6],[4,7],[2,5]] -// [[0,3],[1,4],[5,7],[2,6]] -// [[1,3],[4,6]] -// [[2,4],[5,6]] -// [[2,3]] -#define SORT_NETWORK_9(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - } while (0) - -// 29 comparators, 9 parallel operations -// o--------------^-----^--^--^-----------------------o -// | | | | -// o-----------^--|--^--|--|--v--^--------^-----------o -// | | | | | | | -// o--------^--|--|--|--|--v--^--v-----^--|--^--------o -// | | | | | | | | | -// o-----^--|--|--|--|--v--^--|-----^--|--v--v--^-----o -// | | | | | | | | | | -// o--^--|--|--|--|--v-----|--v--^--|--|--^-----v--^--o -// | | | | | | | | | | | -// o--|--|--|--|--v--^-----|--^--|--v--v--|-----^--v--o -// | | | | | | | | | | -// o--|--|--|--v--^--|-----v--|--v--^-----|--^--v-----o -// | | | | | | | | | -// o--|--|--v-----|--|--^-----v--^--|-----v--v--------o -// | | | | | | | -// o--|--v--------|--v--|--^-----v--v-----------------o -// | | | | -// o--v-----------v-----v--v--------------------------o -// -// [[4,9],[3,8],[2,7],[1,6],[0,5]] -// [[1,4],[6,9],[0,3],[5,8]] -// [[0,2],[3,6],[7,9]] -// 
[[0,1],[2,4],[5,7],[8,9]] -// [[1,2],[4,6],[7,8],[3,5]] -// [[2,5],[6,8],[1,3],[4,7]] -// [[2,3],[6,7]] -// [[3,4],[5,6]] -// [[4,5]] -#define SORT_NETWORK_10(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - } while (0) - -// 35 comparators, 9 parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// o--v--v--------|--|-----|-----v--|--^-----|-----^--|--^--o -// | | | | | | | | | -// o--^--^--------|--|-----|--------v--|-----v--^--|--v--v--o -// | | | | | | | | -// o--v--|--^-----|--v-----|-----------|--------v--v--------o -// | | | | | -// o-----v--v-----v--------v-----------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9]] -// [[1,3],[5,7],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[3,7]] -// [[1,5],[6,10],[4,8]] -// [[5,9],[2,6],[0,4],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_11(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, 
begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 39 comparators, parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// o--v--v--------|--|-----|--^--v--|--^--^--|-----^--|--^--o -// | | | | | | | | | | | -// o--^-----^-----|--|-----|--|-----v--|--|--v--^--|--v--v--o -// | | | | | | | | | | -// o--v--^--|--^--|--v-----|--|--------|--|-----v--v--------o -// | | | | | | | | -// o--^--|--v--v--v--------v--|--------|--v-----------------o -// | | | | -// o--v--v--------------------v--------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]] -// [[1,3],[5,7],[9,11],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[7,11]] -// [[1,5],[6,10],[3,7],[4,8]] -// [[5,9],[2,6],[0,4],[7,11],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_12(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, 
CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 45 comparators, 10 parallel operations -// o--------^--^-----^-----------------------------^-----------------o -// | | | | -// o--^-----|--v-----|-----^--------------^-----^--|-----^-----------o -// | | | | | | | | -// o--|-----|--^--^--v-----|--------------|--^--|--|--^--v--^--------o -// | | | | | | | | | | | -// o--|--^--|--|--v-----^--|--------^-----|--|--v--|--|--^--v-----^--o -// | | | | | | | | | | | | | -// o--|--v--|--|--^-----|--v-----^--v-----|--|--^--|--|--|--^--^--v--o -// | | | | | | | | | | | | | | -// o--|--^--|--|--|--^--|--------|-----^--|--|--|--v--v--v--|--v--^--o -// | | | | | | | | | | | | | | -// o--|--|--|--v--v--|--|--^-----|--^--v--|--v--|--^--------v--^--v--o -// | | | | | | | | | | | | -// o--v--|--|-----^--|--v--|--^--|--|-----v-----v--|--^--------v-----o -// | | | | | | | | | | -// o-----v--|--^--|--|-----|--v--|--|--^-----^-----v--v--^-----------o -// | | | | | | | | | | -// o--^-----|--|--|--v-----|-----v--|--v--^--|--^--------v-----------o -// | | | | | | | | | -// o--|-----|--|--|--^-----|--------v--^--|--v--v--------------------o -// | | | | | | | | -// o--v-----|--v--|--v-----|--^--------v--v--------------------------o -// | | | | -// o--------v-----v--------v--v--------------------------------------o -// -// [[1,7],[9,11],[3,4],[5,8],[0,12],[2,6]] -// [[0,1],[2,3],[4,6],[8,11],[7,12],[5,9]] -// [[0,2],[3,7],[10,11],[1,4],[6,12]] -// [[7,8],[11,12],[4,9],[6,10]] -// [[3,4],[5,6],[8,9],[10,11],[1,7]] -// [[2,6],[9,11],[1,3],[4,7],[8,10],[0,5]] -// [[2,5],[6,8],[9,10]] -// [[1,2],[3,5],[7,8],[4,6]] -// [[2,3],[4,5],[6,7],[8,9]] -// [[3,4],[5,6]] -#define SORT_NETWORK_13(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, 
begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - } while (0) - - -// 51 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^-----------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | -// o--^--v--|--|--|--------------v--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | -// o--v-----v--|--|-----------------v--|--|--------|--v-----|--^--------|--|--^--------o -// | | | | | | | | | | -// o--^--------v--|--------------------v--|--------v--------|--|--------v--v--v--------o -// | | | | | -// o--v-----------v-----------------------v-----------------v--v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13]] -// [[5,10],[6,9],[3,12],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - - -#define SORT_NETWORK_14(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, 
CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - - -// 56 comparators, 10 parallel operations -// o--^--^-----^-----------^--------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^--------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v--------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^-----------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^--------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^-----^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | 
-// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | -// o--^--v--|--|--|--^-----------v--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | -// o--v-----v--|--|--|--------------v--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | -// o--^--^-----v--|--|-----------------v--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | -// o--v--|--------v--|--------------------v--|--^--------------v--|--v--------------------o -// | | | | | -// o-----v-----------v-----------------------v--v-----------------v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - - -#define SORT_NETWORK_15(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - - -// 60 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^-----------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--------^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^-----|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--^--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--|--|--^-----------v--|--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | | -// o--v-----v--|--|--|--^-----------v--|--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--------------v--|--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | | | -// o--v--|--^-----v--|--|-----------------v--|--|--^--------------v--|--v--------------------o -// | | | | | | | | -// o--^--v--|--------v--|--------------------v--|--v-----------------v-----------------------o -// | | | | -// o--v-----v-----------v-----------------------v--------------------------------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11],[13,15]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7],[11,15]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14],[7,15]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - - -#define SORT_NETWORK_16(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, 
CMP, begin[14], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - #define SORT_INNER(TYPE, CMP, begin, end, len) \ switch (len) { \ default: \ + assert(false); \ __unreachable(); \ case 0: \ case 1: \ @@ -6191,30 +5534,6 @@ static int lcklist_detach_locked(MDBX_env *env) { case 8: \ SORT_NETWORK_8(TYPE, CMP, begin); \ break; \ - case 9: \ - SORT_NETWORK_9(TYPE, CMP, begin); \ - break; \ - case 10: \ - SORT_NETWORK_10(TYPE, CMP, begin); \ - break; \ - case 11: \ - SORT_NETWORK_11(TYPE, CMP, begin); \ - break; \ - case 12: \ - SORT_NETWORK_12(TYPE, CMP, begin); \ - break; \ - case 13: \ - SORT_NETWORK_13(TYPE, CMP, begin); \ - break; \ - case 14: \ - SORT_NETWORK_14(TYPE, CMP, begin); \ - break; \ - case 15: \ - SORT_NETWORK_15(TYPE, CMP, begin); \ - break; \ - case 16: \ - SORT_NETWORK_16(TYPE, CMP, begin); \ - break; \ } #define SORT_SWAP(TYPE, a, b) \ @@ -6242,7 +5561,7 @@ static int lcklist_detach_locked(MDBX_env *env) { \ 
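The SORT_NETWORK_* macros removed above are fixed sequences of compare-exchange operations; the comparator lists in the comments (e.g. [[0,1],[2,3],...]) name the index pairs swapped at each parallel stage, and SORT_INNER now falls back to such a network only for runs of up to 8 elements, leaving longer runs to the quicksort that follows. A minimal sketch of the same idea for four elements, using plain ints and an ascending comparison (the names here are illustrative, not part of libmdbx):

```c
#include <stdio.h>

/* Compare-exchange: after the call *a <= *b holds.  This is what
 * SORT_CMP_SWAP(TYPE, CMP, a, b) expands to for an ascending CMP. */
static void cmp_swap(int *a, int *b) {
  if (*b < *a) {
    int t = *a;
    *a = *b;
    *b = t;
  }
}

/* 5-comparator network for 4 elements:
 * stages [[0,1],[2,3]], [[0,2],[1,3]], [[1,2]]. */
static void sort4(int v[4]) {
  cmp_swap(&v[0], &v[1]);
  cmp_swap(&v[2], &v[3]);
  cmp_swap(&v[0], &v[2]);
  cmp_swap(&v[1], &v[3]);
  cmp_swap(&v[1], &v[2]);
}

int main(void) {
  int v[4] = {3, 1, 4, 2};
  sort4(v);
  printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]); /* 1 2 3 4 */
  return 0;
}
```

Dropping the 9..16-element networks and lowering the quicksort cutoff from 16 to 8 trades a little straight-line speed for much less expanded code per SORT_IMPL instantiation; that rationale is an inference from the diff, not stated by upstream.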
static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ while (++first <= last) \ - if (CMP(first[0], first[-1])) \ + if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ return false; \ return true; \ } \ @@ -6251,14 +5570,15 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *lo, *hi; \ } NAME##_stack; \ \ - static __hot void NAME(TYPE *const begin, TYPE *const end) { \ - NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack; \ + __hot static void NAME(TYPE *const __restrict begin, \ + TYPE *const __restrict end) { \ + NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack; \ \ - TYPE *hi = end - 1; \ - TYPE *lo = begin; \ + TYPE *__restrict hi = end - 1; \ + TYPE *__restrict lo = begin; \ while (true) { \ const ptrdiff_t len = hi - lo; \ - if (len < 16) { \ + if (len < 8) { \ SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ if (unlikely(top == stack)) \ break; \ @@ -6266,7 +5586,7 @@ static int lcklist_detach_locked(MDBX_env *env) { continue; \ } \ \ - TYPE *mid = lo + (len >> 1); \ + TYPE *__restrict mid = lo + (len >> 1); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ @@ -6274,9 +5594,9 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *right = hi - 1; \ TYPE *left = lo + 1; \ while (1) { \ - while (CMP(*left, *mid)) \ + while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ ++left; \ - while (CMP(*mid, *right)) \ + while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ --right; \ if (unlikely(left > right)) { \ if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ @@ -6302,7 +5622,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } \ } \ \ - if (mdbx_audit_enabled()) { \ + if (AUDIT_ENABLED()) { \ for (TYPE *scan = begin + 1; scan < end; ++scan) \ assert(CMP(scan[-1], scan[0])); \ } \ @@ -6320,7 +5640,7 @@ static int lcklist_detach_locked(MDBX_env *env) { tmp = begin + length + END_GAP; \ /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ } else { \ - tmp = mdbx_malloc(sizeof(TYPE) * length); \ + tmp = osal_malloc(sizeof(TYPE) * length); \ if (unlikely(!tmp)) \ return false; \ } \ @@ -6373,64 +5693,85 @@ static int lcklist_detach_locked(MDBX_env *env) { } while (key_diff_mask >> 16); \ \ if (!(BUFFER_PREALLOCATED)) \ - mdbx_free(tmp); \ + osal_free(tmp); \ return true; \ } /*------------------------------------------------------------------------------ * LY: Binary search */ +#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do \ + __asm __volatile("" \ + : "+r"(size) \ + : "r" /* the `b` constraint is more suitable here, but \ + cause CLANG to allocate and push/pop an one more \ + register, so using the `r` which avoids this. 
*/ \ + (flag)); \ + while (0) +#else +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do { \ + /* nope for non-clang or non-x86 */; \ + } while (0) +#endif /* Workaround for CLANG */ + +#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key) \ + do { \ + } while (0) + #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ static __always_inline const TYPE_LIST *NAME( \ - const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \ - const TYPE_LIST *const begin = first, *const end = begin + length; \ + const TYPE_LIST *it, unsigned length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = it, *const end = begin + length; \ \ - while (length > 3) { \ - const unsigned whole = length; \ - length >>= 1; \ - const TYPE_LIST *const middle = first + length; \ - const unsigned left = whole - length - 1; \ - const bool cmp = CMP(*middle, item); \ - length = cmp ? left : length; \ - first = cmp ? middle + 1 : first; \ - } \ + if (MDBX_HAVE_CMOV) \ + do { \ + /* Адаптивно-упрощенный шаг двоичного поиска: \ + * - без переходов при наличии cmov или аналога; \ + * - допускает лишние итерации; \ + * - но ищет пока size > 2, что требует дозавершения поиска \ + * среди остающихся 0-1-2 элементов. */ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \ + it = flag ? middle : it; \ + } while (length > 2); \ + else \ + while (length > 2) { \ + /* Вариант с использованием условного перехода. Основное отличие в \ + * том, что при "не равно" (true от компаратора) переход делается на 1 \ + * ближе к концу массива. Алгоритмически это верно и обеспечивает \ + * чуть-чуть более быструю сходимость, но зато требует больше \ + * вычислений при true от компаратора. Также ВАЖНО(!) не допускается \ + * спекулятивное выполнение при size == 0. 
*/ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + if (flag) { \ + it = middle + 1; \ + length -= 1; \ + } \ + } \ + it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ + it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ \ - switch (length) { \ - case 3: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 2: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 1: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 0: \ - break; \ - default: \ - __unreachable(); \ - } \ - \ - if (mdbx_audit_enabled()) { \ - for (const TYPE_LIST *scan = begin; scan < first; ++scan) \ + if (AUDIT_ENABLED()) { \ + for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ assert(CMP(*scan, item)); \ - for (const TYPE_LIST *scan = first; scan < end; ++scan) \ + for (const TYPE_LIST *scan = it; scan < end; ++scan) \ assert(!CMP(*scan, item)); \ (void)begin, (void)end; \ } \ \ - return first; \ + return it; \ } /*----------------------------------------------------------------------------*/ -static __always_inline size_t pnl2bytes(size_t size) { +static __always_inline size_t pnl_size2bytes(size_t size) { assert(size > 0 && size <= MDBX_PGL_LIMIT); #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size += size; @@ -6447,7 +5788,7 @@ static __always_inline size_t pnl2bytes(size_t size) { return bytes; } -static __always_inline pgno_t bytes2pnl(const size_t bytes) { +static __always_inline pgno_t pnl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); size -= 2; @@ -6457,14 +5798,14 @@ static __always_inline pgno_t bytes2pnl(const size_t bytes) { return (pgno_t)size; } -static MDBX_PNL mdbx_pnl_alloc(size_t size) { - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_malloc(bytes); +static MDBX_PNL pnl_alloc(size_t size) { + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_malloc(bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - pl[0] = bytes2pnl(bytes); + pl[0] = pnl_bytes2size(bytes); assert(pl[0] >= size); pl[1] = 0; pl += 1; @@ -6472,34 +5813,35 @@ static MDBX_PNL mdbx_pnl_alloc(size_t size) { return pl; } -static void mdbx_pnl_free(MDBX_PNL pl) { +static void pnl_free(MDBX_PNL pl) { if (likely(pl)) - mdbx_free(pl - 1); + osal_free(pl - 1); } /* Shrink the PNL to the default size if it has grown larger */ -static void mdbx_pnl_shrink(MDBX_PNL *ppl) { - assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && - bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); +static void pnl_shrink(MDBX_PNL *ppl) { + assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && + pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < + MDBX_PNL_INITIAL * 3 / 2); assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); MDBX_PNL_SIZE(*ppl) = 0; if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { 
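The rewritten SEARCH_IMPL above carries its explanation in Russian; roughly translated, the MDBX_HAVE_CMOV branch is an "adaptively simplified binary-search step: branch-free when cmov or an equivalent is available, tolerates redundant iterations, but only narrows the range while size > 2, so the search must be finished among the remaining 0-2 elements", while the fallback branchy variant "steps one element closer to the end of the array on a true comparison, converging slightly faster at the cost of extra work when the comparator returns true, and must not be executed speculatively when size == 0". A self-contained sketch of the branch-free path, assuming an ascending array of page numbers (the helper name is illustrative):

```c
#include <assert.h>
#include <stdint.h>

typedef uint32_t pgno_t;

/* Branch-free lower bound: returns a pointer to the first element that is
 * not less than `key`.  Each step halves `len` and conditionally advances
 * `it`; on a cmov-capable target the ternary compiles to a conditional move,
 * so the loop body has no data-dependent branch.  The loop deliberately
 * stops at len <= 2 and the two trailing lines finish the search, mirroring
 * the MDBX_HAVE_CMOV path of SEARCH_IMPL. */
static const pgno_t *lower_bound(const pgno_t *it, unsigned len, pgno_t key) {
  while (len > 2) {
    const pgno_t *const middle = it + (len >> 1);
    len = (len + 1) >> 1;               /* keep the half that may hold key */
    it = (*middle < key) ? middle : it;
  }
  it += (len > 1 && it[0] < key);
  it += (len > 0 && it[0] < key);
  return it;
}

int main(void) {
  const pgno_t list[] = {2, 3, 5, 8, 13, 21};
  assert(lower_bound(list, 6, 5) == &list[2]);  /* exact hit */
  assert(lower_bound(list, 6, 6) == &list[3]);  /* first element >= 6 */
  assert(lower_bound(list, 6, 99) == &list[6]); /* one past the end */
  return 0;
}
```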
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); *ppl = pl + 1; } } } /* Grow the PNL to the size growed to at least given size */ -static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { +static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); @@ -6507,20 +5849,20 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) { - mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); + ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT) ? wanna + wanna - allocated : MDBX_PGL_LIMIT; - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); assert(*pl >= wanna); *ppl = pl + 1; return MDBX_SUCCESS; @@ -6529,20 +5871,19 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, - size_t num) { +static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, + size_t num) { assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); const size_t wanna = MDBX_PNL_SIZE(*ppl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) - ? MDBX_SUCCESS - : mdbx_pnl_reserve(ppl, wanna); + return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? 
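pnl_alloc()/pnl_reserve() above follow one convention used throughout the PNL, TXL and DPL code: the user-visible pointer sits one word past the start of the allocation, with the capacity stored just before it and the current length at index 0, so elements are addressed 1-based. A simplified sketch of that layout (illustrative helpers, not the libmdbx macros; the real code additionally rounds allocations to power-of-two buckets and feeds malloc_usable_size() slack back into the stored capacity):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint32_t pgno_t;

/* Layout mirrored from the PNL code:
 *   p[-1] = allocated capacity in elements  (MDBX_PNL_ALLOCLEN)
 *   p[0]  = current number of elements      (MDBX_PNL_SIZE)
 *   p[1..p[0]] = the page numbers, 1-based  (MDBX_PNL_BEGIN..MDBX_PNL_END)
 * The caller keeps `p`, one word past the malloc'ed block. */
static pgno_t *list_alloc(pgno_t capacity) {
  pgno_t *raw = malloc(sizeof(pgno_t) * ((size_t)capacity + 2));
  if (!raw)
    return NULL;
  raw[0] = capacity; /* capacity */
  raw[1] = 0;        /* length   */
  return raw + 1;
}

static void list_free(pgno_t *p) {
  if (p)
    free(p - 1);
}

static int list_append(pgno_t *p, pgno_t pgno) {
  if (p[0] >= p[-1])
    return 0; /* full; the real code would grow via pnl_reserve() */
  p[0] += 1;
  p[p[0]] = pgno;
  return 1;
}

int main(void) {
  pgno_t *p = list_alloc(4);
  if (!p)
    return 1;
  list_append(p, 7);
  list_append(p, 3);
  assert(p[0] == 2 && p[1] == 7 && p[2] == 3);
  list_free(p);
  return 0;
}
```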
MDBX_SUCCESS + : pnl_reserve(ppl, wanna); } -static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { +static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) assert(pgno != pl[i]); } @@ -6551,10 +5892,12 @@ static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { } /* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result -mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { +__always_inline static int __must_check_result pnl_append_range(bool spilled, + MDBX_PNL *ppl, + pgno_t pgno, + unsigned n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -6579,11 +5922,10 @@ mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { } /* Append an pgno range into the sorted PNL */ -static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, - pgno_t pgno, - unsigned n) { +__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, + pgno_t pgno, unsigned n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -6599,61 +5941,87 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, return MDBX_SUCCESS; } -static bool mdbx_pnl_check(const MDBX_PNL pl, const size_t limit) { +__hot static bool pnl_check(const pgno_t *pl, const size_t limit) { assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); if (likely(MDBX_PNL_SIZE(pl))) { - assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); - assert(MDBX_PNL_MOST(pl) < limit); - assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT); if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) return false; if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) return false; if (unlikely(MDBX_PNL_MOST(pl) >= limit)) return false; - if (mdbx_audit_enabled()) { - for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { - assert(MDBX_PNL_ORDERED(scan[0], scan[1])); - if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) + + if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && + likely(MDBX_PNL_SIZE(pl) > 1)) { + const pgno_t *scan = MDBX_PNL_BEGIN(pl); + const pgno_t *const end = MDBX_PNL_END(pl); + pgno_t prev = *scan++; + do { + if (unlikely(!MDBX_PNL_ORDERED(prev, *scan))) return false; - } + prev = *scan; + } while (likely(++scan != end)); } } return true; } -static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, - const size_t limit) { - if (unlikely(pl == nullptr)) - return true; - assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); - if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl))) - return false; - return mdbx_pnl_check(pl, limit); +static __always_inline bool pnl_check_allocated(const pgno_t *pl, + const size_t limit) { + return pl == nullptr || + (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl) && pnl_check(pl, limit)); } -/* Merge an PNL onto an PNL. 
The destination PNL must be big enough */ -static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); - assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); - const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); - assert(MDBX_PNL_ALLOCLEN(dst) >= total); - pgno_t *w = dst + total; - pgno_t *d = dst + MDBX_PNL_SIZE(dst); - const pgno_t *s = src + MDBX_PNL_SIZE(src); - dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); - while (s > src) { - while (MDBX_PNL_ORDERED(*s, *d)) - *w-- = *d--; - *w-- = *s--; +static __always_inline void +pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, + const pgno_t *__restrict src_b, + const pgno_t *__restrict const src_b_detent) { + do { +#if MDBX_HAVE_CMOV + const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) + // lcc 1.26: 13ШК (подготовка и первая итерация) + 7ШК (цикл), БЕЗ loop-mode + // gcc>=7: cmp+jmp с возвратом в тело цикла (WTF?) + // gcc<=6: cmov×3 + // clang<=12: cmov×3 + // clang>=13: cmov, set+add/sub + *dst = flag ? *src_a-- : *src_b--; +#else + // gcc: cmov, cmp+set+add/sub + // clang<=5: cmov×2, set+add/sub + // clang>=6: cmov, set+add/sub + *dst = flag ? *src_a : *src_b; + src_b += flag - 1; + src_a -= flag; +#endif + --dst; +#else /* MDBX_HAVE_CMOV */ + while (MDBX_PNL_ORDERED(*src_b, *src_a)) + *dst-- = *src_a--; + *dst-- = *src_b--; +#endif /* !MDBX_HAVE_CMOV */ + } while (likely(src_b > src_b_detent)); +} + +/* Merge a PNL onto a PNL. The destination PNL must be big enough */ +__hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + assert(pnl_check(src, MAX_PAGENO + 1)); + const pgno_t src_len = MDBX_PNL_SIZE(src); + const pgno_t dst_len = MDBX_PNL_SIZE(dst); + if (likely(src_len > 0)) { + const pgno_t total = dst_len + src_len; + assert(MDBX_PNL_ALLOCLEN(dst) >= total); + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + MDBX_PNL_SIZE(dst) = total; } - MDBX_PNL_SIZE(dst) = (pgno_t)total; - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); } -static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { - mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && - txn->tw.spill_least_removed > 0); +static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && + txn->tw.spill_least_removed > 0); txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) ? 
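pnl_merge() above merges a sorted source list into a sorted destination that already has room for both, working from the back so no unread destination element is overwritten; dst[0] is temporarily repurposed as a detent (0 for ascending order, P_INVALID otherwise) so the inner scan needs no bounds check, and the Russian notes in pnl_merge_inner() merely record how lcc/gcc/clang code-generate the branchless pick. A sketch of the same backward, sentinel-terminated merge over 1-based lists of page numbers >= 1 (illustrative only, using the plain while-loop form rather than the cmov variant):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint32_t pgno_t;

/* Merge ascending src[1..src_len] into ascending dst[1..dst_len], writing
 * from the back.  dst[0] is set to 0, a value smaller than any real page
 * number, so the inner loop stops at it without an explicit bounds check --
 * the same role the `detent` plays in pnl_merge().  dst must have room for
 * dst_len + src_len elements after index 0. */
static void merge_sorted(pgno_t *dst, unsigned dst_len,
                         const pgno_t *src, unsigned src_len) {
  pgno_t *w = dst + dst_len + src_len; /* write cursor, back to front */
  pgno_t *d = dst + dst_len;           /* last unread dst element */
  const pgno_t *s = src + src_len;     /* last unread src element */
  dst[0] = 0;                          /* sentinel */
  while (s > src) {
    while (*d > *s)                    /* safe: stops at the sentinel */
      *w-- = *d--;
    *w-- = *s--;
  }
}

int main(void) {
  pgno_t dst[1 + 8] = {0, 2, 5, 9};        /* dst[1..3], spare room behind */
  const pgno_t src[1 + 3] = {0, 3, 4, 11}; /* src[1..3] */
  merge_sorted(dst, 3, src, 3);
  for (unsigned i = 1; i <= 6; ++i)
    printf("%u ", dst[i]);                 /* 2 3 4 5 9 11 */
  printf("\n");
  return 0;
}
```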
idx : txn->tw.spill_least_removed; txn->tw.spill_pages[idx] |= 1; @@ -6680,8 +6048,8 @@ static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { } } -static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.spill_least_removed > 0); +static MDBX_PNL spill_purge(MDBX_txn *txn) { + tASSERT(txn, txn->tw.spill_least_removed > 0); const MDBX_PNL sl = txn->tw.spill_pages; if (txn->tw.spill_least_removed != INT_MAX) { unsigned len = MDBX_PNL_SIZE(sl), r, w; @@ -6690,12 +6058,12 @@ static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); + tASSERT(txn, (sl[i] & 1) == 0); MDBX_PNL_SIZE(sl) = w - 1; txn->tw.spill_least_removed = INT_MAX; } else { for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); + tASSERT(txn, (sl[i] & 1) == 0); } return sl; } @@ -6710,15 +6078,15 @@ RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) -static __hot void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { +__hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); } -static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { - mdbx_pnl_sort_nochk(pnl); - assert(mdbx_pnl_check(pnl, limit4check)); +static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { + pnl_sort_nochk(pnl); + assert(pnl_check(pnl, limit4check)); (void)limit4check; } @@ -6726,7 +6094,8 @@ static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { * Returns The index of the first item greater than or equal to pgno. */ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -static __hot unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, pgno_t pgno) { +__hot __noinline static unsigned pnl_search_nochk(const MDBX_PNL pnl, + pgno_t pgno) { const pgno_t *begin = MDBX_PNL_BEGIN(pnl); const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); @@ -6738,53 +6107,53 @@ static __hot unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, pgno_t pgno) { return (unsigned)(it - begin + 1); } -static __inline unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno, - size_t limit) { - assert(mdbx_pnl_check4assert(pnl, limit)); +static __inline unsigned pnl_search(const MDBX_PNL pnl, pgno_t pgno, + size_t limit) { + assert(pnl_check_allocated(pnl, limit)); assert(pgno < limit); (void)limit; - return mdbx_pnl_search_nochk(pnl, pgno); + return pnl_search_nochk(pnl, pgno); } -static __inline unsigned mdbx_search_spilled(const MDBX_txn *txn, pgno_t pgno) { +static __inline unsigned search_spilled(const MDBX_txn *txn, pgno_t pgno) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return 0; pgno <<= 1; - unsigned n = mdbx_pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); + unsigned n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? 
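The spill list handled by spill_remove()/spill_purge()/search_spilled() above stores each page number shifted left by one bit, so the low bit can serve as a lazy "removed" mark: deletions just set the bit and the sorted list is compacted later by spill_purge(). A small sketch of that encoding (helper names are illustrative, not libmdbx's):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t pgno_t;

static inline pgno_t spill_encode(pgno_t pgno)    { return pgno << 1; }
static inline pgno_t spill_mark_removed(pgno_t v) { return v | 1; }
static inline bool   spill_is_removed(pgno_t v)   { return (v & 1) != 0; }
static inline pgno_t spill_decode(pgno_t v)       { return v >> 1; }

int main(void) {
  pgno_t entry = spill_encode(42);   /* page 42 -> stored as 84 */
  assert(!spill_is_removed(entry) && spill_decode(entry) == 42);
  entry = spill_mark_removed(entry); /* 84 -> 85, page number unchanged */
  assert(spill_is_removed(entry) && spill_decode(entry) == 42);
  /* A page range [pgno, pgno + npages) maps to the encoded interval
   * [pgno << 1, ((pgno + npages) << 1) - 1], which covers both the clean
   * and the removed form of every page in it -- the interval that
   * intersect_spilled() below probes with a single binary search. */
  return 0;
}
```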
n : 0; } -static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { +static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, + unsigned npages) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return false; const unsigned len = MDBX_PNL_SIZE(pnl); - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL len %u [", len); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL len %u [", len); for (unsigned i = 1; i <= len; ++i) - mdbx_debug_extra_print(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) - : (long)(pnl[i] >> 1)); - mdbx_debug_extra_print("%s\n", "]"); + DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) + : (long)(pnl[i] >> 1)); + DEBUG_EXTRA_PRINT("%s\n", "]"); } const pgno_t spilled_range_begin = pgno << 1; const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; #if MDBX_PNL_ASCENDING const unsigned n = - mdbx_pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last; #else const unsigned n = - mdbx_pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n])); const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin; #endif - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; for (unsigned i = 0; i < npages; ++i) - check |= mdbx_search_spilled(txn, pgno + i) != 0; + check |= search_spilled(txn, pgno + i) != 0; assert(check == rc); } return rc; @@ -6792,7 +6161,7 @@ static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, /*----------------------------------------------------------------------------*/ -static __always_inline size_t txl2bytes(const size_t size) { +static __always_inline size_t txl_size2bytes(const size_t size) { assert(size > 0 && size <= MDBX_TXL_MAX * 2); size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), @@ -6801,20 +6170,20 @@ static __always_inline size_t txl2bytes(const size_t size) { return bytes; } -static __always_inline size_t bytes2txl(const size_t bytes) { +static __always_inline size_t txl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(txnid_t); assert(size > 2 && size <= MDBX_TXL_MAX * 2); return size - 2; } -static MDBX_TXL mdbx_txl_alloc(void) { - size_t bytes = txl2bytes(MDBX_TXL_INITIAL); - MDBX_TXL tl = mdbx_malloc(bytes); +static MDBX_TXL txl_alloc(void) { + size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL); + MDBX_TXL tl = osal_malloc(bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - tl[0] = bytes2txl(bytes); + tl[0] = txl_bytes2size(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); tl[1] = 0; tl += 1; @@ -6822,12 +6191,12 @@ static MDBX_TXL mdbx_txl_alloc(void) { return tl; } -static void mdbx_txl_free(MDBX_TXL tl) { +static void txl_free(MDBX_TXL tl) { if (likely(tl)) - mdbx_free(tl - 1); + osal_free(tl - 1); } -static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { +static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && 
MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); @@ -6835,20 +6204,20 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { - mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); + ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) ? wanna + wanna - allocated : MDBX_TXL_MAX; - size_t bytes = txl2bytes(size); - MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes); + size_t bytes = txl_size2bytes(size); + MDBX_TXL tl = osal_realloc(*ptl - 1, bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - *tl = bytes2txl(bytes); + *tl = txl_bytes2size(bytes); assert(*tl >= wanna); *ptl = tl + 1; return MDBX_SUCCESS; @@ -6856,18 +6225,17 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, - size_t num) { +static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, + size_t num) { assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) - ? MDBX_SUCCESS - : mdbx_txl_reserve(ptl, wanna); + return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS + : txl_reserve(ptl, wanna); } -static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { +static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); MDBX_PNL_SIZE(tl) += 1; MDBX_PNL_LAST(tl) = id; @@ -6875,17 +6243,17 @@ static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { #define TXNID_SORT_CMP(first, last) ((first) > (last)) SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) -static void mdbx_txl_sort(MDBX_TXL tl) { +static void txl_sort(MDBX_TXL tl) { txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } -static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { +static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { - int rc = mdbx_txl_need(ptl, MDBX_TXL_GRANULATE); + int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_txl_xappend(*ptl, id); + txl_xappend(*ptl, id); return MDBX_SUCCESS; } @@ -6897,7 +6265,7 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { #define MDBX_DPL_RESERVE_GAP \ (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) -static __always_inline size_t dpl2bytes(ptrdiff_t size) { +static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); #if MDBX_DPL_PREALLOC_FOR_RADIXSORT size += size; @@ -6916,7 +6284,7 @@ static __always_inline size_t dpl2bytes(ptrdiff_t size) { return bytes; } -static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { +static __always_inline unsigned dpl_bytes2size(const ptrdiff_t bytes) { size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); @@ -6943,35 +6311,37 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) { static const MDBX_page dpl_stub_pageB = {{0}, 0, 
P_BAD, {0}, /* pgno */ 0}; assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); dl->sorted = dpl_setlen(dl, 0); + dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].pgno = 0; dl->items[0].extra = 0; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static void mdbx_dpl_free(MDBX_txn *txn) { +static void dpl_free(MDBX_txn *txn) { if (likely(txn->tw.dirtylist)) { - mdbx_free(txn->tw.dirtylist); + osal_free(txn->tw.dirtylist); txn->tw.dirtylist = NULL; } } -static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) { - size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); - MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes); +static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { + size_t bytes = + dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); + MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); if (likely(dl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(dl); #endif /* malloc_usable_size */ - dl->detent = bytes2dpl(bytes); - mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); + dl->detent = dpl_bytes2size(bytes); + tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); txn->tw.dirtylist = dl; } return dl; } -static int mdbx_dpl_alloc(MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); +static int dpl_alloc(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) ? txn->mt_env->me_options.dp_initial : txn->mt_geo.upper; @@ -6983,7 +6353,7 @@ static int mdbx_dpl_alloc(MDBX_txn *txn) { (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) return MDBX_SUCCESS; } - if (unlikely(!mdbx_dpl_reserve(txn, wanna))) + if (unlikely(!dpl_reserve(txn, wanna))) return MDBX_ENOMEM; dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; @@ -6996,7 +6366,7 @@ RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) -__hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { +__hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); const unsigned unsorted = dl->length - dl->sorted; @@ -7017,19 +6387,23 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); dp_sort(tmp, tmp + unsorted); /* merge two parts from end to begin */ - MDBX_dp *w = dl->items + dl->length; - MDBX_dp *l = dl->items + dl->sorted; - MDBX_dp *r = end - 1; + MDBX_dp *__restrict w = dl->items + dl->length; + MDBX_dp *__restrict l = dl->items + dl->sorted; + MDBX_dp *__restrict r = end - 1; do { - const bool cmp = l->pgno > r->pgno; + const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV + *w = cmp ? *l-- : *r--; +#else *w = cmp ? 
*l : *r; l -= cmp; r += cmp - 1; +#endif } while (likely(--w > l)); assert(r == tmp - 1); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_assert_enabled()) + if (ASSERT_ENABLED()) for (unsigned i = 0; i <= dl->length; ++i) assert(dl->items[i].pgno < dl->items[i + 1].pgno); } else { @@ -7045,12 +6419,12 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { return dl; } -static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { +static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT); assert(dl->sorted <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return likely(dl->sorted == dl->length) ? dl : mdbx_dpl_sort_slowpath(txn); + return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn); } /* Returns the index of the first dirty-page whose pgno @@ -7058,10 +6432,10 @@ static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { +__hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { assert(ptr[0].pgno < ptr[1].pgno); assert(ptr[0].pgno >= NUM_METAS); @@ -7071,7 +6445,7 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { switch (dl->length - dl->sorted) { default: /* sort a whole */ - mdbx_dpl_sort_slowpath(txn); + dpl_sort_slowpath(txn); break; case 0: /* whole sorted cases */ @@ -7083,22 +6457,13 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { return dl->length - N + 1; \ __fallthrough - /* try linear search until the threshold */ - LINEAR_SEARCH_CASE(16); /* fall through */ - LINEAR_SEARCH_CASE(15); /* fall through */ - LINEAR_SEARCH_CASE(14); /* fall through */ - LINEAR_SEARCH_CASE(13); /* fall through */ - LINEAR_SEARCH_CASE(12); /* fall through */ - LINEAR_SEARCH_CASE(11); /* fall through */ - LINEAR_SEARCH_CASE(10); /* fall through */ - LINEAR_SEARCH_CASE(9); /* fall through */ - LINEAR_SEARCH_CASE(8); /* fall through */ - LINEAR_SEARCH_CASE(7); /* fall through */ - LINEAR_SEARCH_CASE(6); /* fall through */ - LINEAR_SEARCH_CASE(5); /* fall through */ - LINEAR_SEARCH_CASE(4); /* fall through */ - LINEAR_SEARCH_CASE(3); /* fall through */ - LINEAR_SEARCH_CASE(2); /* fall through */ + /* use linear scan until the threshold */ + LINEAR_SEARCH_CASE(7); /* fall through */ + LINEAR_SEARCH_CASE(6); /* fall through */ + LINEAR_SEARCH_CASE(5); /* fall through */ + LINEAR_SEARCH_CASE(4); /* fall through */ + LINEAR_SEARCH_CASE(3); /* fall through */ + LINEAR_SEARCH_CASE(2); /* fall through */ case 1: if (dl->items[dl->length].pgno == pgno) return dl->length; @@ -7121,19 +6486,19 @@ dpl_endpgno(const MDBX_dpl *dl, unsigned i) { return dpl_npages(dl, i) + dl->items[i].pgno; } -static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { +static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, + unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 
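dpl_sort_slowpath() above exploits the `sorted` counter kept in the dirty-page list: newly appended pages form an unsorted tail, only that tail is sorted (into the reserve gap past the end), and the two sorted runs are then merged from back to front in place, so already-sorted entries move at most once. A sketch with ints, qsort standing in for the radix/network sorts, and a heap buffer standing in for the reserve gap (illustrative only):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int cmp_int(const void *a, const void *b) {
  const int x = *(const int *)a, y = *(const int *)b;
  return (x > y) - (x < y);
}

/* items[0..sorted-1] is already ascending; items[sorted..length-1] is the
 * unsorted tail.  Sort the tail separately, then merge both runs from the
 * end so nothing is overwritten before it has been read. */
static void sort_tail_and_merge(int *items, unsigned length, unsigned sorted) {
  const unsigned unsorted = length - sorted;
  int *tmp = malloc(sizeof(int) * unsorted);
  if (!tmp)
    return;
  memcpy(tmp, items + sorted, sizeof(int) * unsorted);
  qsort(tmp, unsorted, sizeof(int), cmp_int);

  unsigned w = length, l = sorted, r = unsorted;
  while (r > 0) {
    if (l > 0 && items[l - 1] > tmp[r - 1])
      items[--w] = items[--l];
    else
      items[--w] = tmp[--r];
  }
  free(tmp);
}

int main(void) {
  int items[] = {2, 5, 9, 7, 1}; /* first 3 sorted, last 2 appended */
  sort_tail_and_merge(items, 5, 3);
  for (unsigned i = 0; i < 5; ++i)
    printf("%d ", items[i]);     /* 1 2 5 7 9 */
  printf("\n");
  return 0;
}
```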
1].pgno == P_INVALID); - unsigned const n = mdbx_dpl_search(txn, pgno); + unsigned const n = dpl_search(txn, pgno); assert(n >= 1 && n <= dl->length + 1); assert(pgno <= dl->items[n].pgno); assert(pgno > dl->items[n - 1].pgno); const bool rc = /* intersection with founded */ pgno + npages > dl->items[n].pgno || /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; for (unsigned i = 1; i <= dl->length; ++i) { const MDBX_page *const dp = dl->items[i].ptr; @@ -7146,9 +6511,9 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { +static __always_inline unsigned dpl_exist(MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; - unsigned i = mdbx_dpl_search(txn, pgno); + unsigned i = dpl_search(txn, pgno); assert((int)i > 0); return (dl->items[i].pgno == pgno) ? i : 0; } @@ -7170,10 +6535,11 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, return nullptr; } -static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { +static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert((int)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + dl->pages_including_loose -= npages; dl->sorted -= dl->sorted >= i; dl->length -= 1; memmove(dl->items + i, dl->items + i + 1, @@ -7181,16 +6547,22 @@ static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static __always_inline int __must_check_result -mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { +static void dpl_remove(const MDBX_txn *txn, unsigned i) { + dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); +} + +static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, + pgno_t pgno, + MDBX_page *page, + unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (unsigned i = dl->length; i > 0; --i) { assert(dl->items[i].pgno != pgno); if (unlikely(dl->items[i].pgno == pgno)) { - mdbx_error("Page %u already exist in the DPL at %u", pgno, i); + ERROR("Page %u already exist in the DPL at %u", pgno, i); return MDBX_PROBLEM; } } @@ -7204,16 +6576,16 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { if (unlikely(dl->length == dl->detent)) { if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { - mdbx_error("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); + ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) ? 
dl->detent + dl->detent : dl->detent + dl->detent / 2; - dl = mdbx_dpl_reserve(txn, size); + dl = dpl_reserve(txn, size); if (unlikely(!dl)) return MDBX_ENOMEM; - mdbx_tassert(txn, dl->length < dl->detent); + tASSERT(txn, dl->length < dl->detent); } /* copy the stub beyond the end */ @@ -7225,11 +6597,12 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { dl->items[length].lru = txn->tw.dirtylru++; dl->length = length; dl->sorted = sorted; + dl->pages_including_loose += npages; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); return MDBX_SUCCESS; } -static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { +static __inline uint32_t dpl_age(const MDBX_txn *txn, unsigned i) { const MDBX_dpl *dl = txn->tw.dirtylist; assert((int)i > 0 && i <= dl->length); /* overflow could be here */ @@ -7238,30 +6611,27 @@ static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { /*----------------------------------------------------------------------------*/ -uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; -uint8_t mdbx_loglevel = MDBX_LOG_FATAL; -MDBX_debug_func *mdbx_debug_logger; +uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +uint8_t loglevel = MDBX_LOG_FATAL; +MDBX_debug_func *debug_logger; -static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc, - MDBX_page *mp); +static __must_check_result __inline int page_retire(MDBX_cursor *mc, + MDBX_page *mp); -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages); -struct page_result { +static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages); +typedef struct page_result { MDBX_page *page; int err; -}; +} pgr_t; -static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const pgno_t num, - int flags); -static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard); +static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages); -static int mdbx_page_touch(MDBX_cursor *mc); -static int mdbx_cursor_touch(MDBX_cursor *mc); -static int mdbx_touch_dbi(MDBX_cursor *mc); +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); +static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); +static int page_touch(MDBX_cursor *mc); +static int cursor_touch(MDBX_cursor *mc); +static int touch_dbi(MDBX_cursor *mc); #define MDBX_END_NAMES \ { \ @@ -7269,7 +6639,7 @@ static int mdbx_touch_dbi(MDBX_cursor *mc); "fail-beginchild" \ } enum { - /* mdbx_txn_end operation number, for logging */ + /* txn_end operation number, for logging */ MDBX_END_COMMITTED, MDBX_END_PURE_COMMIT, MDBX_END_ABORT, @@ -7278,148 +6648,150 @@ enum { MDBX_END_FAIL_BEGIN, MDBX_END_FAIL_BEGINCHILD }; -#define MDBX_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ #define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ #define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ #define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); +static int txn_end(MDBX_txn *txn, const unsigned mode); -__hot static struct page_result __must_check_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, txnid_t front); -static 
__always_inline int __must_check_result mdbx_page_get(MDBX_cursor *mc, - pgno_t pgno, - MDBX_page **mp, - txnid_t front) { +static __always_inline pgr_t page_get_inline(const uint16_t ILL, + MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front); - struct page_result ret = mdbx_page_get_ex(mc, pgno, front); +static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS, mc, pgno, front); +} + +__hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); +} + +static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, + front); +} + +static __always_inline int __must_check_result page_get(MDBX_cursor *mc, + const pgno_t pgno, + MDBX_page **mp, + const txnid_t front) { + pgr_t ret = page_get_three(mc, pgno, front); *mp = ret.page; return ret.err; } -static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, - const MDBX_val *key, - int flags); +static int __must_check_result page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags); #define MDBX_PS_MODIFY 1 #define MDBX_PS_ROOTONLY 2 #define MDBX_PS_FIRST 4 #define MDBX_PS_LAST 8 -static int __must_check_result mdbx_page_search(MDBX_cursor *mc, - const MDBX_val *key, int flags); -static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, - MDBX_cursor *cdst); +static int __must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key, + int flags); +static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ -static int __must_check_result mdbx_page_split(MDBX_cursor *mc, - const MDBX_val *const newkey, - MDBX_val *const newdata, - pgno_t newpgno, unsigned nflags); +static int __must_check_result page_split(MDBX_cursor *mc, + const MDBX_val *const newkey, + MDBX_val *const newdata, + pgno_t newpgno, const unsigned naf); -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, - bool report); -static int __must_check_result mdbx_validate_meta_copy(MDBX_env *env, - const MDBX_meta *meta, - MDBX_meta *dest); -static int __must_check_result mdbx_override_meta(MDBX_env *env, - unsigned target, - txnid_t txnid, - const MDBX_meta *shape); -static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, - const int lck_exclusive, - const mdbx_mode_t mode_bits); -static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending); -static int mdbx_env_close0(MDBX_env *env); +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report); +static int __must_check_result validate_meta_copy(MDBX_env *env, + const MDBX_meta *meta, + MDBX_meta *dest); +static int __must_check_result override_meta(MDBX_env *env, unsigned target, + txnid_t txnid, + const MDBX_meta *shape); +static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, + const int lck_exclusive, + const mdbx_mode_t mode_bits); +static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending, + meta_troika_t *const troika); +static int env_close(MDBX_env *env); struct node_result { MDBX_node *node; bool exact; }; -static struct node_result mdbx_node_search(MDBX_cursor *mc, - const MDBX_val *key); +static struct node_result 
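page_get_any()/page_get_three()/page_get_large() above are thin wrappers around a single __always_inline page_get_inline() that takes a constant bitmask of page types which are illegal for that caller, letting the compiler specialize the validation per call site. A sketch of the pattern with made-up flag values (the P_* constants and function names below are illustrative, not the real definitions):

```c
#include <stdint.h>
#include <stdio.h>

/* Page-type flag bits -- values invented for the example. */
#define P_BRANCH   0x01u
#define P_LEAF     0x02u
#define P_OVERFLOW 0x04u

typedef struct { uint16_t flags; } page_t;

/* One generic checker, parameterized by the set of page types that are
 * *illegal* for a particular caller.  Every caller passes a compile-time
 * constant mask, so after inlining the compiler folds the check down to a
 * single specialized test, as with page_get_inline() and its wrappers. */
static inline int page_check_type(const uint16_t illegal, const page_t *p) {
  return (p->flags & illegal) ? -1 /* unexpected page type */ : 0;
}

static int page_expect_branch_or_leaf(const page_t *p) {
  return page_check_type(P_OVERFLOW, p);        /* overflow pages illegal */
}

static int page_expect_overflow(const page_t *p) {
  return page_check_type(P_BRANCH | P_LEAF, p); /* tree pages illegal */
}

int main(void) {
  const page_t leaf = {P_LEAF}, ovf = {P_OVERFLOW};
  printf("%d %d %d\n",
         page_expect_branch_or_leaf(&leaf), /* 0  */
         page_expect_overflow(&ovf),        /* 0  */
         page_expect_branch_or_leaf(&ovf)); /* -1 */
  return 0;
}
```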
node_search(MDBX_cursor *mc, const MDBX_val *key); -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno); -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags); -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key); +static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + pgno_t pgno); +static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + MDBX_val *data, unsigned flags); +static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key); -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); -static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, - MDBX_cursor *cdst, bool fromleft); -static int __must_check_result mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *leaf, - MDBX_val *data, - const txnid_t front); -static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); -static int __must_check_result mdbx_update_key(MDBX_cursor *mc, - const MDBX_val *key); +static void node_del(MDBX_cursor *mc, size_t ksize); +static void node_shrink(MDBX_page *mp, unsigned indx); +static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, + bool fromleft); +static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, + MDBX_val *data, const MDBX_page *mp); +static int __must_check_result rebalance(MDBX_cursor *mc); +static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); -static void mdbx_cursor_pop(MDBX_cursor *mc); -static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); +static void cursor_pop(MDBX_cursor *mc); +static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); -static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, - unsigned retired_stored, - bool dont_filter_gc); +static int __must_check_result audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc); -static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, - unsigned options); -static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc, - unsigned options); -static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); -static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, - const MDBX_val *data, unsigned flags); +static int __must_check_result page_check(MDBX_cursor *const mc, + const MDBX_page *const mp); +static int __must_check_result cursor_check(MDBX_cursor *mc); +static int __must_check_result cursor_check_updating(MDBX_cursor *mc); +static int __must_check_result cursor_del(MDBX_cursor *mc); +static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, + const MDBX_val *data, unsigned flags); #define SIBLING_LEFT 0 #define SIBLING_RIGHT 2 -static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); -static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); +static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); +static int __must_check_result 
cursor_next(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_prev(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); struct cursor_set_result { int err; bool exact; }; -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); -static int __must_check_result mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); +static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); +static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); -static int __must_check_result mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, - MDBX_dbi dbi); -static int __must_check_result mdbx_xcursor_init0(MDBX_cursor *mc); -static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, - MDBX_node *node, - const MDBX_page *mp); -static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, - MDBX_xcursor *src_mx, - bool new_dupdata); +static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn, + MDBX_dbi dbi); +static int __must_check_result cursor_xinit0(MDBX_cursor *mc); +static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp); +static int __must_check_result cursor_xinit2(MDBX_cursor *mc, + MDBX_xcursor *src_mx, + bool new_dupdata); static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); -static int __must_check_result mdbx_drop_tree(MDBX_cursor *mc, - const bool may_have_subDBs); -static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); -static int __must_check_result mdbx_setup_dbx(MDBX_dbx *const dbx, - const MDBX_db *const db, - const unsigned pagesize); +static int __must_check_result drop_tree(MDBX_cursor *mc, + const bool may_have_subDBs); +static int __must_check_result fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); +static int __must_check_result setup_dbx(MDBX_dbx *const dbx, + const MDBX_db *const db, + const unsigned pagesize); static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, cmp_int_unaligned, cmp_lenfast; @@ -7577,30 +6949,30 @@ const char *mdbx_strerror_ANSI2OEM(int errnum) { } #endif /* Bit of madness for Windows */ -__cold void mdbx_debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args) { - if (mdbx_debug_logger) - mdbx_debug_logger(level, function, line, fmt, args); +__cold void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args) { + if (debug_logger) + debug_logger(level, function, line, fmt, args); else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { int prefix_len = 0; char *prefix = nullptr; if (function && line > 0) - prefix_len = mdbx_asprintf(&prefix, "%s:%d ", function, line); + prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); else if (function) - prefix_len = mdbx_asprintf(&prefix, "%s: ", function); + prefix_len = osal_asprintf(&prefix, "%s: ", function); else if (line > 0) - prefix_len = mdbx_asprintf(&prefix, "%d: ", line); + prefix_len = osal_asprintf(&prefix, "%d: ", line); if (prefix_len > 0 && prefix) { OutputDebugStringA(prefix); - mdbx_free(prefix); + osal_free(prefix); } char *msg = nullptr; - int msg_len = 
mdbx_vasprintf(&msg, fmt, args); + int msg_len = osal_vasprintf(&msg, fmt, args); if (msg_len > 0 && msg) { OutputDebugStringA(msg); - mdbx_free(msg); + osal_free(msg); } } #else @@ -7616,11 +6988,11 @@ __cold void mdbx_debug_log_va(int level, const char *function, int line, } } -__cold void mdbx_debug_log(int level, const char *function, int line, - const char *fmt, ...) { +__cold void debug_log(int level, const char *function, int line, + const char *fmt, ...) { va_list args; va_start(args, fmt); - mdbx_debug_log_va(level, function, line, fmt, args); + debug_log_va(level, function, line, fmt, args); va_end(args); } @@ -7671,17 +7043,16 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, /*------------------------------------------------------------------------------ LY: debug stuff */ -static const char *mdbx_leafnode_type(MDBX_node *n) { +static const char *leafnode_type(MDBX_node *n) { static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(node_flags(n), F_BIGDATA) - ? ": overflow page" - : tp[F_ISSET(node_flags(n), F_DUPDATA)] - [F_ISSET(node_flags(n), F_SUBDATA)]; + return (node_flags(n) & F_BIGDATA) + ? ": large page" + : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)]; } /* Display all the keys in the page. */ -MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { +MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type; MDBX_node *node; @@ -7689,8 +7060,7 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { MDBX_val key; DKBUF; - switch (mp->mp_flags & - (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { + switch (PAGETYPE_WHOLE(mp)) { case P_BRANCH: type = "Branch page"; break; @@ -7707,26 +7077,26 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { type = "Leaf2 sub-page"; break; case P_OVERFLOW: - mdbx_verbose("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); + VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); return; case P_META: - mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, - unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); + VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); return; default: - mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); + VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); return; } nkeys = page_numkeys(mp); - mdbx_verbose("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); + VERBOSE("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ key.iov_len = nsize = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, i, nsize); total += nsize; - mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + VERBOSE("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } node = page_node(mp, i); @@ -7734,24 +7104,24 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { key.iov_base = node->mn_data; nsize = (unsigned)(NODESIZE + key.iov_len); if (IS_BRANCH(mp)) { - mdbx_verbose("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), - DKEY(&key)); + VERBOSE("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); total += nsize; } else { - if (F_ISSET(node_flags(node), F_BIGDATA)) + if (node_flags(node) & F_BIGDATA) nsize += sizeof(pgno_t); else nsize += (unsigned)node_ds(node); total += nsize; nsize += 
sizeof(indx_t); - mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), - mdbx_leafnode_type(node)); + VERBOSE("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + leafnode_type(node)); } total = EVEN(total); } - mdbx_verbose("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - page_room(mp)); + VERBOSE("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, + page_room(mp)); } /*----------------------------------------------------------------------------*/ @@ -7782,9 +7152,9 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ - mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ - mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ + cASSERT(&(mn), \ + mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + cASSERT(&(mn), !cursor_is_tracked(&(mn))); \ MDBX_cursor mc_dummy; \ MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ MDBX_cursor *tracked = &(mn); \ @@ -7803,25 +7173,25 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } /* Allocate memory for a page. * Re-use old malloc'ed pages first for singletons, otherwise just malloc. * Set MDBX_TXN_ERROR on failure. 
*/ -static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { +static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { MDBX_env *env = txn->mt_env; MDBX_page *np = env->me_dp_reserve; size_t size = env->me_psize; if (likely(num == 1 && np)) { - mdbx_assert(env, env->me_dp_reserve_len > 0); + eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); @@ -7829,7 +7199,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); - np = mdbx_malloc(size); + np = osal_malloc(size); if (unlikely(!np)) { txn->mt_flags |= MDBX_TXN_ERROR; return np; @@ -7856,7 +7226,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { } /* Free a shadow dirty page */ -static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { +static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -7873,97 +7243,100 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { } else { /* large pages just get freed directly */ VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + osal_free(dp); } } /* Return all dirty pages to dpage list */ -static void mdbx_dlist_free(MDBX_txn *txn) { +static void dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; - for (unsigned i = 1; i <= dl->length; i++) { - MDBX_page *dp = dl->items[i].ptr; - mdbx_dpage_free(env, dp, dpl_npages(dl, i)); - } + for (unsigned i = 1; i <= dl->length; i++) + dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); dpl_clear(dl); } -static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { - mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0); +static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) { + cASSERT(mc, (mc->mc_flags & C_SUB) != 0); MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); return couple->outer.mc_db; } -MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { +MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { const MDBX_dpl *const dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - mdbx_tassert(txn, txn->tw.dirtyroom + dl->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + dl->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); - if (!mdbx_audit_enabled()) + if (!AUDIT_ENABLED()) return true; - unsigned loose = 0; + unsigned loose = 0, pages = 0; for (unsigned i = dl->length; i > 0; --i) { const MDBX_page *const dp = dl->items[i].ptr; if (!dp) continue; - mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno); + tASSERT(txn, dp->mp_pgno == dl->items[i].pgno); if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - const uint32_t age = mdbx_dpl_age(txn, i); - mdbx_tassert(txn, age < UINT32_MAX / 3); + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); if (unlikely(age > UINT32_MAX / 3)) return false; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); if (dp->mp_flags == P_LOOSE) { loose += 1; } else if (unlikely(!IS_MODIFIABLE(txn, dp))) return false; const unsigned num = dpl_npages(dl, i); - mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num); + pages += num; + tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) return false; if (i < dl->sorted) { - mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); + tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) return false; } - const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, - txn->mt_next_pgno); - mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || - txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); + const unsigned rpa = + pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); + tASSERT(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || + txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) return false; if (num > 1) { - const unsigned rpb = mdbx_pnl_search( - txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1, txn->mt_next_pgno); - mdbx_tassert(txn, rpa == rpb); + const unsigned rpb = pnl_search(txn->tw.reclaimed_pglist, + dp->mp_pgno + num - 1, txn->mt_next_pgno); + tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; } } - mdbx_tassert(txn, loose == txn->tw.loose_count); + tASSERT(txn, loose == txn->tw.loose_count); if (unlikely(loose != txn->tw.loose_count)) return false; + tASSERT(txn, pages == dl->pages_including_loose); + if (unlikely(pages != dl->pages_including_loose)) + return false; + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); - mdbx_tassert(txn, !dp); + tASSERT(txn, !dp); if (unlikely(dp)) return false; } @@ -7972,20 +7345,20 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { } #if MDBX_ENABLE_REFUND -static void mdbx_refund_reclaimed(MDBX_txn *txn) { +static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; const MDBX_PNL pnl = txn->tw.reclaimed_pglist; - mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); + tASSERT(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING unsigned i = MDBX_PNL_SIZE(pnl); - mdbx_tassert(txn, pnl[i] == next_pgno - 1); + tASSERT(txn, pnl[i] == next_pgno - 1); while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) ; MDBX_PNL_SIZE(pnl) = i; #else unsigned i = 1; - mdbx_tassert(txn, pnl[i] == 
next_pgno - 1); + tASSERT(txn, pnl[i] == next_pgno - 1); unsigned len = MDBX_PNL_SIZE(pnl); while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) ; @@ -7993,42 +7366,42 @@ static void mdbx_refund_reclaimed(MDBX_txn *txn) { for (unsigned move = 0; move < len; ++move) pnl[1 + move] = pnl[i + move]; #endif - mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); + VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - 1)); + tASSERT(txn, + pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1)); } -static void mdbx_refund_loose(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.loose_pages != nullptr); - mdbx_tassert(txn, txn->tw.loose_count > 0); +static void refund_loose(MDBX_txn *txn) { + tASSERT(txn, txn->tw.loose_pages != nullptr); + tASSERT(txn, txn->tw.loose_count > 0); MDBX_dpl *const dl = txn->tw.dirtylist; - mdbx_tassert(txn, dl->length >= txn->tw.loose_count); + tASSERT(txn, dl->length >= txn->tw.loose_count); pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; MDBX_PNL suitable = onstack; if (dl->length - dl->sorted > txn->tw.loose_count) { /* Dirty list is useless since unsorted. */ - if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { - suitable = mdbx_pnl_alloc(txn->tw.loose_count); + if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { + suitable = pnl_alloc(txn->tw.loose_count); if (unlikely(!suitable)) return /* this is not a reason for transaction fail */; } /* Collect loose-pages which may be refunded. */ - mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); + tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; unsigned w = 0; for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { - mdbx_tassert(txn, lp->mp_flags == P_LOOSE); - mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno); + tASSERT(txn, lp->mp_flags == P_LOOSE); + tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { - mdbx_tassert(txn, - w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) - : MDBX_PNL_ALLOCLEN(suitable))); + tASSERT(txn, + w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); suitable[++w] = lp->mp_pgno; most = (lp->mp_pgno > most) ? lp->mp_pgno : most; } @@ -8037,14 +7410,14 @@ static void mdbx_refund_loose(MDBX_txn *txn) { if (most + 1 == txn->mt_next_pgno) { /* Sort suitable list and refund pages at the tail. */ MDBX_PNL_SIZE(suitable) = w; - mdbx_pnl_sort(suitable, MAX_PAGENO + 1); + pnl_sort(suitable, MAX_PAGENO + 1); /* Scanning in descend order */ const int step = MDBX_PNL_ASCENDING ? -1 : 1; const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; const int end = MDBX_PNL_ASCENDING ? 
0 : MDBX_PNL_SIZE(suitable) + 1; - mdbx_tassert(txn, suitable[begin] >= suitable[end - step]); - mdbx_tassert(txn, most == suitable[begin]); + tASSERT(txn, suitable[begin] >= suitable[end - step]); + tASSERT(txn, most == suitable[begin]); for (int i = begin + step; i != end; i += step) { if (suitable[i] != most - 1) @@ -8052,10 +7425,11 @@ static void mdbx_refund_loose(MDBX_txn *txn) { most -= 1; } const unsigned refunded = txn->mt_next_pgno - most; - mdbx_debug("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, - refunded, most, txn->mt_next_pgno); + DEBUG("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, + most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); txn->mt_next_pgno = most; @@ -8078,8 +7452,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } } dpl_setlen(dl, w); - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -8087,20 +7460,20 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } } else { /* Dirtylist is mostly sorted, just refund loose pages at the end. */ - mdbx_dpl_sort(txn); - mdbx_tassert(txn, dl->length < 2 || - dl->items[1].pgno < dl->items[dl->length].pgno); - mdbx_tassert(txn, dl->sorted == dl->length); + dpl_sort(txn); + tASSERT(txn, + dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); + tASSERT(txn, dl->sorted == dl->length); /* Scan dirtylist tail-forward and cutoff suitable pages. */ unsigned n; for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && dl->items[n].ptr->mp_flags == P_LOOSE; --n) { - mdbx_tassert(txn, n > 0); + tASSERT(txn, n > 0); MDBX_page *dp = dl->items[n].ptr; - mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno); - mdbx_tassert(txn, dp->mp_pgno == dl->items[n].pgno); + DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno); + tASSERT(txn, dp->mp_pgno == dl->items[n].pgno); txn->mt_next_pgno -= 1; } dpl_setlen(dl, n); @@ -8110,8 +7483,8 @@ static void mdbx_refund_loose(MDBX_txn *txn) { dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + dl->pages_including_loose -= refunded; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -8119,41 +7492,41 @@ static void mdbx_refund_loose(MDBX_txn *txn) { unlink_loose: for (MDBX_page **link = &txn->tw.loose_pages; *link;) { MDBX_page *dp = *link; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + tASSERT(txn, dp->mp_flags == P_LOOSE); if (txn->mt_next_pgno > dp->mp_pgno) { link = &dp->mp_next; } else { *link = dp->mp_next; if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, dp, 1); + dpage_free(txn->mt_env, dp, 1); } } } } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (suitable != onstack) - mdbx_pnl_free(suitable); + pnl_free(suitable); txn->tw.loose_refund_wl = txn->mt_next_pgno; } -static bool mdbx_refund(MDBX_txn *txn) { +static bool txn_refund(MDBX_txn *txn) { const pgno_t before = txn->mt_next_pgno; if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) - mdbx_refund_loose(txn); + refund_loose(txn); while (true) { if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) break; - mdbx_refund_reclaimed(txn); + refund_reclaimed(txn); if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) break; const pgno_t memo = txn->mt_next_pgno; - mdbx_refund_loose(txn); + refund_loose(txn); if (memo == txn->mt_next_pgno) break; } @@ -8163,29 +7536,29 @@ static bool mdbx_refund(MDBX_txn *txn) { if (txn->tw.spill_pages) /* Squash deleted pagenums if we refunded any */ - mdbx_spill_purge(txn); + spill_purge(txn); return true; } #else /* MDBX_ENABLE_REFUND */ -static __inline bool mdbx_refund(MDBX_txn *txn) { +static __inline bool txn_refund(MDBX_txn *txn) { (void)txn; /* No online auto-compactification. */ return false; } #endif /* MDBX_ENABLE_REFUND */ -__cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - unsigned npages) { +__cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, + unsigned npages) { MDBX_env *const env = txn->mt_env; - mdbx_debug("kill %u page(s) %" PRIaPGNO, npages, pgno); - mdbx_assert(env, pgno >= NUM_METAS && npages); + DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); + eASSERT(env, pgno >= NUM_METAS && npages); if (!IS_FROZEN(txn, mp)) { const size_t bytes = pgno2bytes(env, npages); memset(mp, -1, bytes); mp->mp_pgno = pgno; if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); + osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { struct iovec iov[MDBX_COMMIT_PAGES]; iov[0].iov_len = env->me_psize; @@ -8195,29 +7568,28 @@ __cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { - mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, + osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, pgno2bytes(env, MDBX_COMMIT_PAGES)); iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); n = 0; } } - mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); } } /* Remove page from dirty list */ -static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, - MDBX_page *const mp, - const unsigned npages) { - mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - mdbx_dpl_remove(txn, di); +static __inline void page_wash(MDBX_txn *txn, const unsigned di, + MDBX_page *const mp, const unsigned npages) { + 
tASSERT(txn, di && di <= txn->tw.dirtylist->length && + txn->tw.dirtylist->items[di].ptr == mp); + dpl_remove_ex(txn, di, npages); txn->tw.dirtyroom++; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = 0xFFFF; + mp->mp_flags = P_BAD; VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); if (txn->mt_flags & MDBX_WRITEMAP) { VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), @@ -8225,17 +7597,7 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); } else - mdbx_dpage_free(txn->mt_env, mp, npages); -} - -static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) { - (void)txn; -#if MDBX_DISABLE_PAGECHECKS - (void)mp; - return 0; -#else - return /* maybe zero in legacy DB */ mp->mp_txnid; -#endif /* !MDBX_DISABLE_PAGECHECKS */ + dpage_free(txn->mt_env, mp, npages); } /* Retire, loosen or free a single page. @@ -8246,12 +7608,12 @@ static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) { * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ -static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, - MDBX_page *mp /* maybe null */, - int pagetype /* maybe unknown/zero */) { +static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, + MDBX_page *mp /* maybe null */, + unsigned pageflags /* maybe unknown/zero */) { int rc; MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && PAGETYPE(mp) == pagetype)); + tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); /* During deleting entire subtrees, it is reasonable and possible to avoid * reading leaf pages, i.e. 
significantly reduce hard page-faults & IOPs: @@ -8267,40 +7629,41 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, unsigned di = 0, si = 0, npages = 1; bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { - if (mdbx_assert_enabled() && pagetype) { - MDBX_page *check; - rc = mdbx_page_get(mc, pgno, &check, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, (PAGETYPE(check) & ~P_LEAF2) == (pagetype & ~P_FROZEN)); - mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check)); + if (ASSERT_ENABLED() && pageflags) { + pgr_t check; + check = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(check.err != MDBX_SUCCESS)) + return check.err; + tASSERT(txn, + (check.page->mp_flags & ~P_LEAF2) == (pageflags & ~P_FROZEN)); + tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } - if (pagetype & P_FROZEN) { + if (pageflags & P_FROZEN) { is_frozen = true; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(scan, pgno)); - mdbx_tassert(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); + tASSERT(txn, !search_spilled(scan, pgno)); + tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); } } goto status_done; - } else if (pagetype && txn->tw.dirtylist) { - if ((di = mdbx_dpl_exist(txn, pgno)) != 0) { + } else if (pageflags && txn->tw.dirtylist) { + if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; - mdbx_tassert(txn, IS_MODIFIABLE(txn, mp)); + tASSERT(txn, IS_MODIFIABLE(txn, mp)); goto status_done; } - if ((si = mdbx_search_spilled(txn, pgno)) != 0) { + if ((si = search_spilled(txn, pgno)) != 0) { is_spilled = true; goto status_done; } for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) { + if (dpl_exist(parent, pgno)) { is_shadowed = true; goto status_done; } - if (mdbx_search_spilled(parent, pgno)) { + if (search_spilled(parent, pgno)) { is_spilled = true; goto status_done; } @@ -8309,11 +7672,12 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, goto status_done; } - rc = mdbx_page_get(mc, pgno, &mp, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype); - pagetype = PAGETYPE(mp); + pgr_t pg = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(pg.err != MDBX_SUCCESS)) + return pg.err; + mp = pg.page; + tASSERT(txn, !pageflags || mp->mp_flags == pageflags); + pageflags = mp->mp_flags; } is_frozen = IS_FROZEN(txn, mp); @@ -8322,49 +7686,49 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); is_shadowed = IS_SHADOWED(txn, mp); if (is_dirty) { - mdbx_tassert(txn, !is_spilled); - mdbx_tassert(txn, !mdbx_search_spilled(txn, pgno)); - mdbx_tassert(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || - (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !is_spilled); + tASSERT(txn, !search_spilled(txn, pgno)); + tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || + (txn->mt_flags & MDBX_WRITEMAP)); } else { - mdbx_tassert(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } - di = is_dirty ? mdbx_dpl_exist(txn, pgno) : 0; - si = is_spilled ? mdbx_search_spilled(txn, pgno) : 0; - mdbx_tassert(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); + di = is_dirty ? 
dpl_exist(txn, pgno) : 0; + si = is_spilled ? search_spilled(txn, pgno) : 0; + tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); } else { - mdbx_tassert(txn, !IS_MODIFIABLE(txn, mp)); - mdbx_tassert(txn, !IS_SPILLED(txn, mp)); - mdbx_tassert(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, !IS_MODIFIABLE(txn, mp)); + tASSERT(txn, !IS_SPILLED(txn, mp)); + tASSERT(txn, !IS_SHADOWED(txn, mp)); } status_done: - if (likely((pagetype & P_OVERFLOW) == 0)) { + if (likely((pageflags & P_OVERFLOW) == 0)) { STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = pagetype & P_BRANCH; + const bool is_branch = pageflags & P_BRANCH; if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0); + MDBX_db *outer = outer_db(mc); + cASSERT(mc, !is_branch || outer->md_branch_pages > 0); outer->md_branch_pages -= is_branch; - mdbx_cassert(mc, is_branch || outer->md_leaf_pages > 0); + cASSERT(mc, is_branch || outer->md_leaf_pages > 0); outer->md_leaf_pages -= 1 - is_branch; } - mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0); + cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0); mc->mc_db->md_branch_pages -= is_branch; - mdbx_cassert(mc, (pagetype & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); - mc->mc_db->md_leaf_pages -= (pagetype & P_LEAF) != 0; + cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); + mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0; } else { npages = mp->mp_pages; - mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages); + cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); mc->mc_db->md_overflow_pages -= npages; } if (is_frozen) { retire: - mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno); - rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + DEBUG("retire %u page %" PRIaPGNO, npages, pgno); + rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); + tASSERT(txn, dirtylist_check(txn)); return rc; } @@ -8379,44 +7743,43 @@ status_done: * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "dirty"; /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + page_wash(txn, di, mp, npages); } else if (si) { /* Страница пролита в этой транзакции, т.е. она аллоцирована * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "spilled"; - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); } else if ((txn->mt_flags & MDBX_WRITEMAP)) { kind = "writemap"; - mdbx_tassert(txn, mp && IS_MODIFIABLE(txn, mp)); + tASSERT(txn, mp && IS_MODIFIABLE(txn, mp)); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной * из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ kind = "parent's"; - if (mdbx_assert_enabled() && mp) { + if (ASSERT_ENABLED() && mp) { kind = nullptr; for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_search_spilled(parent, pgno)) { + if (search_spilled(parent, pgno)) { kind = "parent-spilled"; - mdbx_tassert(txn, is_spilled); + tASSERT(txn, is_spilled); break; } if (mp == debug_dpl_find(parent, pgno)) { kind = "parent-dirty"; - mdbx_tassert(txn, !is_spilled); + tASSERT(txn, !is_spilled); break; } } - mdbx_tassert(txn, kind != nullptr); + tASSERT(txn, kind != nullptr); } - mdbx_tassert(txn, - is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); + tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); } - mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; - mdbx_refund(txn); + txn_refund(txn); return MDBX_SUCCESS; } @@ -8430,7 +7793,7 @@ status_done: txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { - mdbx_debug("loosen dirty page %" PRIaPGNO, pgno); + DEBUG("loosen dirty page %" PRIaPGNO, pgno); mp->mp_flags = P_LOOSE; mp->mp_next = txn->tw.loose_pages; txn->tw.loose_pages = mp; @@ -8460,16 +7823,16 @@ status_done: for (MDBX_txn *parent = txn->mt_parent; parent && (parent->mt_flags & MDBX_TXN_SPILLS); parent = parent->mt_parent) { - if (mdbx_intersect_spilled(parent, pgno, npages)) + if (intersect_spilled(parent, pgno, npages)) goto skip_invalidate; - if (mdbx_dpl_intersect(parent, pgno, npages)) + if (dpl_intersect(parent, pgno, npages)) goto skip_invalidate; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - mdbx_kill_page(txn, mp, pgno, npages); + kill_page(txn, mp, pgno, npages); if (!(txn->mt_flags & MDBX_WRITEMAP)) { VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); @@ -8480,21 +7843,20 @@ status_done: } skip_invalidate: /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + page_wash(txn, di, mp, npages); reclaim: - mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); + rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); return rc; } if (si) { /* Page ws spilled in this txn */ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); /* Страница могла быть выделена и затем пролита в этой транзакции, * тогда её необходимо поместить в reclaimed-список. * Либо она могла быть выделена в одной из родительских транзакций и затем @@ -8502,7 +7864,7 @@ status_done: * retired-список для последующей фильтрации при коммите. 
*/ for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) + if (dpl_exist(parent, pgno)) goto retire; } /* Страница точно была выделена в этой транзакции @@ -8512,15 +7874,15 @@ status_done: if (is_shadowed) { /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const MDBX_page *parent_dp = nullptr; /* Check parent(s)'s dirty lists. */ for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; parent = parent->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(parent, pgno)); + tASSERT(txn, !search_spilled(parent, pgno)); parent_dp = debug_dpl_find(parent, pgno); } - mdbx_tassert(txn, parent_dp && (!mp || parent_dp == mp)); + tASSERT(txn, parent_dp && (!mp || parent_dp == mp)); } /* Страница была выделена в родительской транзакции и теперь может быть * использована повторно, но только внутри этой транзакции, либо дочерних. @@ -8537,11 +7899,11 @@ status_done: goto retire; } -static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { - return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, PAGETYPE(mp)); +static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { + return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } -struct mdbx_iov_ctx { +struct iov_ctx { unsigned iov_items; size_t iov_bytes; size_t iov_off; @@ -8550,8 +7912,7 @@ struct mdbx_iov_ctx { struct iovec iov[MDBX_COMMIT_PAGES]; }; -static __inline void mdbx_iov_init(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { +static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) { ctx->flush_begin = MAX_PAGENO; ctx->flush_end = MIN_PAGENO; ctx->iov_items = 0; @@ -8560,39 +7921,37 @@ static __inline void mdbx_iov_init(MDBX_txn *const txn, (void)txn; } -static __inline void mdbx_iov_done(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, ctx->iov_items == 0); +static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) { + tASSERT(txn, ctx->iov_items == 0); #if defined(__linux__) || defined(__gnu_linux__) MDBX_env *const env = txn->mt_env; - if (!(txn->mt_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version < 0x02060b00) + if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00) /* Linux kernels older than version 2.6.11 ignore the addr and nbytes * arguments, making this function fairly expensive. Therefore, the * whole cache is always flushed. 
*/ - mdbx_flush_incoherent_mmap( + osal_flush_incoherent_mmap( env->me_map + pgno2bytes(env, ctx->flush_begin), pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); #endif /* Linux */ } -static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - mdbx_tassert(txn, ctx->iov_items > 0); +static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, ctx->iov_items > 0); MDBX_env *const env = txn->mt_env; int rc; if (likely(ctx->iov_items == 1)) { - mdbx_assert(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); - rc = mdbx_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, + eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); + rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, ctx->iov_off); } else { - rc = mdbx_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, + rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, ctx->iov_bytes); } if (unlikely(rc != MDBX_SUCCESS)) - mdbx_error("Write error: %s", mdbx_strerror(rc)); + ERROR("Write error: %s", mdbx_strerror(rc)); else { VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, ctx->iov_bytes); @@ -8616,15 +7975,14 @@ static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { while (likely(rc == MDBX_SUCCESS) && unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { if (!timestamp) { - timestamp = mdbx_osal_monotime(); - mdbx_iov_done(txn, ctx); - mdbx_warning( + timestamp = osal_monotime(); + iov_done(txn, ctx); + WARNING( "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); - } else if (unlikely(mdbx_osal_monotime() - timestamp > 65536 / 10)) { - mdbx_error( - "bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); + } else if (unlikely(osal_monotime() - timestamp > 65536 / 10)) { + ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); rc = MDBX_CORRUPTED; } #if defined(_WIN32) || defined(_WIN64) @@ -8637,19 +7995,17 @@ static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { usleep(42); #endif } - mdbx_dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); + dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } return rc; } -static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, +static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, unsigned npages) { MDBX_env *const env = txn->mt_env; - mdbx_tassert(txn, - dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); - mdbx_tassert(txn, IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, - !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); + tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, IS_MODIFIABLE(txn, dp)); + tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); ctx->flush_begin = (ctx->flush_begin < dp->mp_pgno) ? 
ctx->flush_begin : dp->mp_pgno; @@ -8659,24 +8015,24 @@ static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, env->me_lck->mti_unsynced_pages.weak += npages; if (IS_SHADOWED(txn, dp)) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; - mdbx_tassert(txn, IS_SPILLED(txn, dp)); + tASSERT(txn, IS_SPILLED(txn, dp)); const size_t size = pgno2bytes(env, npages); if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || ctx->iov_items == ARRAY_LENGTH(ctx->iov) || ctx->iov_bytes + size > MAX_WRITE) { if (ctx->iov_items) { - int err = mdbx_iov_write(txn, ctx); + int err = iov_write(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; #if defined(__linux__) || defined(__gnu_linux__) - if (mdbx_linux_kernel_version >= 0x02060b00) + if (linux_kernel_version >= 0x02060b00) /* Linux kernels older than version 2.6.11 ignore the addr and nbytes * arguments, making this function fairly expensive. Therefore, the * whole cache is always flushed. */ #endif /* Linux */ - mdbx_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, + osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, env->me_os_psize); } ctx->iov_off = pgno2bytes(env, dp->mp_pgno); @@ -8686,18 +8042,18 @@ static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, ctx->iov_items += 1; ctx->iov_bytes += size; } else { - mdbx_tassert(txn, txn->mt_flags & MDBX_WRITEMAP); + tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); } return MDBX_SUCCESS; } -static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, +static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, unsigned npages) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); pgno_t pgno = dp->mp_pgno; int err = iov_page(txn, ctx, dp, npages); if (likely(err == MDBX_SUCCESS)) { - err = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); + err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); #if MDBX_ENABLE_PGOP_STAT if (likely(err == MDBX_SUCCESS)) txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; @@ -8708,15 +8064,15 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. */ -static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { +static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { unsigned keep = 0; while (mc->mc_flags & C_INITIALIZED) { for (unsigned i = 0; i < mc->mc_snum; ++i) { const MDBX_page *mp = mc->mc_pg[i]; if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { - unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); + unsigned const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - mdbx_dpl_age(txn, n)) { + dpl_age(txn, n)) { txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; ++keep; } @@ -8729,14 +8085,14 @@ static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { return keep; } -static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - unsigned keep = m0 ? mdbx_cursor_keep(txn, m0) : 0; +static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + unsigned keep = m0 ? 
cursor_keep(txn, m0) : 0; for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && txn->mt_dbs[i].md_root != P_INVALID) for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) if (mc != m0) - keep += mdbx_cursor_keep(txn, mc); + keep += cursor_keep(txn, mc); return keep; } @@ -8747,21 +8103,21 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; - const uint32_t age = mdbx_dpl_age(txn, i); + const uint32_t age = dpl_age(txn, i); const unsigned npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; if (age == 0) { - mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); return 256; } MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - mdbx_debug("skip %s %u page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" - : (dp->mp_flags & P_LOOSE) ? "loose" - : "parent-spilled", - npages, pgno); + DEBUG("skip %s %u page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) ? "loose" + : (dp->mp_flags & P_LOOSE) ? "loose" + : "parent-spilled", + npages, pgno); return 256; } @@ -8770,17 +8126,17 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, MDBX_txn *parent = txn->mt_parent; if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { do - if (mdbx_intersect_spilled(parent, pgno, npages)) { - mdbx_debug("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + if (intersect_spilled(parent, pgno, npages)) { + DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_SPILLED; return 256; } while ((parent = parent->mt_parent) != nullptr); } - mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX); unsigned prio = age * reciprocal >> 24; - mdbx_tassert(txn, prio < 256); + tASSERT(txn, prio < 256); if (likely(npages == 1)) return prio = 256 - prio; @@ -8792,7 +8148,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, factor |= factor >> 16; factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 255 - factor : 0; - mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); + tASSERT(txn, factor < 256 && factor < (256 - prio)); return prio = factor; } @@ -8814,8 +8170,8 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of mdbx_page_touch(). Such references are - * handled by mdbx_page_unspill(). + * going thru all of the work of page_touch(). Such references are + * handled by page_unspill(). * * Also note, we never spill DB root pages, nor pages of active cursors, * because we'll need these back again soon anyway. And in nested txns, @@ -8823,8 +8179,8 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * parent txn. That would alter the parent txns' data even though * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. 
*/ -static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const unsigned need) { +static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const unsigned need) { #if xMDBX_DEBUG_SPILLING != 1 /* production mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) @@ -8850,12 +8206,12 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!wanna_spill) return MDBX_SUCCESS; - mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)", - wanna_spill, txn->tw.dirtyroom, need); - mdbx_tassert(txn, txn->tw.dirtylist->length >= wanna_spill); + NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill, + txn->tw.dirtyroom, need); + tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); + struct iov_ctx ctx; + iov_init(txn, &ctx); int rc = MDBX_SUCCESS; if (txn->mt_flags & MDBX_WRITEMAP) { MDBX_dpl *const dl = txn->tw.dirtylist; @@ -8868,21 +8224,21 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, dl->items[++w] = dl->items[r]; else if (!MDBX_FAKE_SPILL_WRITEMAP) { rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r)); - mdbx_tassert(txn, rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); } } - mdbx_tassert(txn, span == r - 1 - w && w == txn->tw.loose_count); + tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count); dl->sorted = (dl->sorted == dl->length) ? w : 0; dpl_setlen(dl, w); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { MDBX_env *const env = txn->mt_env; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, + rc = osal_msync(&env->me_dxb_mmap, pgno_align2os_bytes(env, ctx.flush_begin), pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), MDBX_SYNC_NONE); @@ -8890,10 +8246,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, return rc; } - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); if (!txn->tw.spill_pages) { txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = mdbx_pnl_alloc(wanna_spill); + txn->tw.spill_pages = pnl_alloc(wanna_spill); if (unlikely(!txn->tw.spill_pages)) { rc = MDBX_ENOMEM; bailout: @@ -8902,27 +8258,27 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } } else { /* purge deleted slots */ - mdbx_spill_purge(txn); - rc = mdbx_pnl_reserve(&txn->tw.spill_pages, wanna_spill); + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); (void)rc /* ignore since the resulting list may be shorter - and mdbx_pnl_append() will increase pnl on demand */ + and pnl_append() will increase pnl on demand */ ; } /* Сортируем чтобы запись на диск была полее последовательна */ - MDBX_dpl *const dl = mdbx_dpl_sort(txn); + MDBX_dpl *const dl = dpl_sort(txn); /* Preserve pages which may soon be dirtied again */ - const unsigned unspillable = mdbx_txn_keep(txn, m0); + const unsigned unspillable = txn_keep(txn, m0); if (unspillable + txn->tw.loose_count >= dl->length) { #if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - mdbx_error("all %u dirty pages are unspillable since referenced " - "by a cursor(s), use fewer cursors or increase " - "MDBX_opt_txn_dp_limit", - 
unspillable); + ERROR("all %u dirty pages are unspillable since referenced " + "by a cursor(s), use fewer cursors or increase " + "MDBX_opt_txn_dp_limit", + unspillable); goto done; } @@ -8952,11 +8308,11 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, /* get min/max of LRU-labels */ uint32_t age_max = 0; for (unsigned i = 1; i <= dl->length; ++i) { - const uint32_t age = mdbx_dpl_age(txn, i); + const uint32_t age = dpl_age(txn, i); age_max = (age_max >= age) ? age_max : age; } - mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); + VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ unsigned radix_counters[256], spillable = 0, spilled = 0; @@ -8985,10 +8341,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, break; } - mdbx_verbose("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " - "wanna_spill %u", - prio2spill, prio2adjacent, amount, spillable, wanna_spill); - mdbx_tassert(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); + VERBOSE("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " + "wanna_spill %u", + prio2spill, prio2adjacent, amount, spillable, wanna_spill); + tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); unsigned prev_prio = 256; unsigned r, w, prio; @@ -9002,10 +8358,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prio <= prio2spill) { if (prev_prio < prio2adjacent && prev_prio > prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - dpl_npages(dl, w), dl->items[r - 1].pgno, - mdbx_dpl_age(txn, r - 1), prev_prio); + DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO + " (age %d, prio %u)", + dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), + prev_prio); --w; rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, dpl_npages(dl, r - 1)); @@ -9014,8 +8370,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, ++spilled; } - mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, - dp->mp_pgno, mdbx_dpl_age(txn, r), prio); + DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, + dp->mp_pgno, dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -9024,9 +8380,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio); + DEBUG("co-spill %u next-adjacent page %" PRIaPGNO + " (age %d, prio %u)", + npages, dp->mp_pgno, dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -9038,50 +8394,51 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, dl->items[++w] = dl->items[r]; } - mdbx_tassert(txn, spillable == 0 || spilled > 0); + tASSERT(txn, spillable == 0 || spilled > 0); while (r <= dl->length) dl->items[++w] = dl->items[r++]; - mdbx_tassert(txn, r - 1 - w == spilled); + tASSERT(txn, r - 1 - w == spilled); dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += spilled; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); - if (ctx.iov_items) - rc = mdbx_iov_write(txn, &ctx); + if (ctx.iov_items) { + /* iov_page() frees dirty-pages and reset iov_items in case of failure. 
*/ + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(txn, &ctx); + } if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - mdbx_pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; - mdbx_notice("spilled %u dirty-entries, now have %u dirty-room", spilled, - txn->tw.dirtyroom); - mdbx_iov_done(txn, &ctx); + NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled, + txn->tw.dirtyroom); + iov_done(txn, &ctx); } else { - mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); + tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; - mdbx_notice( - "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, - dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i), - spill_prio(txn, i, reciprocal)); + NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } } #if xMDBX_DEBUG_SPILLING == 2 if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) - mdbx_error("dirty-list length: before %u, after %u, parent %i, loose %u; " - "needed %u, spillable %u; " - "spilled %u dirty-entries, now have %u dirty-room", - dl->length + spilled, dl->length, - (txn->mt_parent && txn->mt_parent->tw.dirtylist) - ? (int)txn->mt_parent->tw.dirtylist->length - : -1, - txn->tw.loose_count, need, spillable, spilled, - txn->tw.dirtyroom); - mdbx_ensure(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); + ERROR("dirty-list length: before %u, after %u, parent %i, loose %u; " + "needed %u, spillable %u; " + "spilled %u dirty-entries, now have %u dirty-room", + dl->length + spilled, dl->length, + (txn->mt_parent && txn->mt_parent->tw.dirtylist) + ? (int)txn->mt_parent->tw.dirtylist->length + : -1, + txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom); + ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); #endif /* xMDBX_DEBUG_SPILLING */ done: @@ -9091,8 +8448,8 @@ done: : MDBX_TXN_FULL; } -static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { +static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, + const MDBX_val *data) { MDBX_txn *txn = mc->mc_txn; /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ @@ -9119,7 +8476,7 @@ static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, mc->mc_txn->mt_env->debug_dirtied_act = 0; #endif /* xMDBX_DEBUG_SPILLING == 2 */ - return mdbx_txn_spill(txn, mc, need); + return txn_spill(txn, mc, need); } /*----------------------------------------------------------------------------*/ @@ -9140,52 +8497,96 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, #define METAPAGE(env, n) page_meta(pgno2page(env, n)) #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) -MDBX_NOTHROW_PURE_FUNCTION static __inline txnid_t -constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { - mdbx_memory_fence(mo_AcquireRelease, false); - txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); - mdbx_assert(env, a == b); - (void)env; - return (a == b) ? 
a : 0; +MDBX_NOTHROW_PURE_FUNCTION static txnid_t +constmeta_txnid(const MDBX_meta *meta) { + const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); + const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); + return likely(a == b) ? a : 0; } -static __inline txnid_t meta_txnid(const MDBX_env *env, - volatile const MDBX_meta *meta) { - (void)env; - mdbx_memory_fence(mo_AcquireRelease, false); - txnid_t a = unaligned_peek_u64_volatile(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64_volatile(4, &meta->mm_txnid_b); - return (a == b) ? a : 0; +typedef struct { + uint64_t txnid; + size_t is_steady; +} meta_snap_t; + +static __always_inline txnid_t +atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) { +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr, + mo_AcquireRelease); +#else + const uint32_t l = atomic_load32( + &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + const uint32_t h = atomic_load32( + &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + return (uint64_t)h << 32 | l; +#endif +} + +static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) { + txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a); + jitter4testing(true); + size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID; + jitter4testing(true); + if (unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b))) + txnid = is_steady = 0; + meta_snap_t r = {txnid, is_steady}; + return r; +} + +static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) { + return meta_snap(meta).txnid; } static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && - unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && + unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - unaligned_poke_u64(4, meta->mm_txnid_b, 0); - mdbx_memory_fence(mo_AcquireRelease, true); - unaligned_poke_u64(4, meta->mm_txnid_a, txnid); +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, + mo_AcquireRelease); + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < 
txnid); (void)env; - mdbx_jitter4testing(true); + jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); - unaligned_poke_u64(4, meta->mm_txnid_b, txnid); - mdbx_memory_fence(mo_AcquireRelease, true); +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, const txnid_t txnid) { - mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) || - meta >= METAPAGE_END(env)); + eASSERT(env, + !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env)); (void)env; /* update inconsistently since this function used ONLY for filling meta-image * for writing, but not the actual meta-page */ @@ -9205,198 +8606,285 @@ static __inline uint64_t meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign; } -enum meta_choise_mode { prefer_last, prefer_steady }; +typedef struct { + txnid_t txnid; + union { + const volatile MDBX_meta *ptr_v; + const MDBX_meta *ptr_c; + }; + size_t is_steady; +} meta_ptr_t; -static __inline bool meta_ot(const enum meta_choise_mode mode, - const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); - const txnid_t txnid_a = meta_txnid(env, a); - mdbx_jitter4testing(true); - const txnid_t txnid_b = meta_txnid(env, b); - mdbx_jitter4testing(true); - const bool is_stead_b = META_IS_STEADY(b); - - if (mode == prefer_steady) { - mdbx_jitter4testing(true); - const bool is_stead_a = META_IS_STEADY(a); - if (is_stead_a != is_stead_b) - return is_stead_b; - } else { - mdbx_assert(env, mode == prefer_last); - } - if (txnid_a == txnid_b) - return is_stead_b; - return txnid_a < txnid_b; +static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) { + eASSERT(env, n < NUM_METAS); + meta_ptr_t r; + meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n)); + r.txnid = snap.txnid; + r.is_steady = snap.is_steady; + return r; } -static bool meta_eq(const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); - const txnid_t txnid = meta_txnid(env, a); - if (!txnid || txnid != meta_txnid(env, b)) - return false; - - mdbx_jitter4testing(true); - if (META_IS_STEADY(a) != META_IS_STEADY(b)) - return false; - - mdbx_jitter4testing(true); - return true; +static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) { + return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s; } -static int meta_eq_mask(const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - int rc = meta_eq(env, m0, m1) ? 
1 : 0; - if (meta_eq(env, m1, m2)) - rc += 2; - if (meta_eq(env, m2, m0)) - rc += 4; - return rc; +static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady); } -static __inline volatile const MDBX_meta * -meta_recent(const enum meta_choise_mode mode, const MDBX_env *env, - volatile const MDBX_meta *a, volatile const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(mode, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); - return a_older_that_b ? b : a; +static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1); } -static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, - const MDBX_meta *a, - const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(prefer_steady, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); - return a_older_that_b ? a : b; +static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static __inline volatile const MDBX_meta * -meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - volatile const MDBX_meta *head = meta_recent(mode, env, m0, m1); - head = meta_recent(mode, env, head, m2); - return head; +static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { - return meta_mostrecent(prefer_steady, env); +MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, + uint8_t c12, bool s0, bool s1, + bool s2) { + assert(c01 < 3 && c02 < 3 && c12 < 3); + /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ + const uint8_t recent = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1) + ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail; + if (recent == 0) + tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent == 1) + tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; + + const bool valid = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * -constmeta_prefer_steady(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_steady, env); +static __inline void meta_troika_unpack(meta_troika_t *troika, + const uint8_t packed) { + troika->recent = (packed >> 2) & 3; + troika->prefer_steady = (packed >> 4) & 3; + troika->tail_and_flags = packed & 0xC3; } -static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { - return meta_mostrecent(prefer_last, env); +static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { + 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, + 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, + 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, + 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, + 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, + 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, + 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, + 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, + 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, + 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, + 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, + 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, + 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, + 214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194, + 210, 194, 225, 193, 210, 194}; + +__hot static meta_troika_t meta_tap(const MDBX_env *env) { + meta_snap_t snap; + meta_troika_t troika; + snap = meta_snap(METAPAGE(env, 0)); + troika.txnid[0] = snap.txnid; + troika.fsm = (uint8_t)snap.is_steady << 0; + snap = meta_snap(METAPAGE(env, 1)); + troika.txnid[1] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 1; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); + snap = meta_snap(METAPAGE(env, 2)); + troika.txnid[2] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 2; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); + troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); + + meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); + return troika; } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * -constmeta_prefer_last(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_last, env); +static txnid_t recent_committed_txnid(const MDBX_env *env) { + const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); + const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); + const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); + return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); - if (likely(head == meta_prefer_last(env) && - recent == meta_txnid(env, head))) - return recent; - } +static __inline bool meta_eq(const meta_troika_t *troika, unsigned a, + unsigned b) { + assert(a < NUM_METAS && b < NUM_METAS); + return troika->txnid[a] == troika->txnid[b] && + (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && + troika->txnid[a]; } -static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_steady(env); - const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); - if (likely(head == meta_prefer_steady(env) && - recent == meta_txnid(env, head))) - return recent; - } +static unsigned meta_eq_mask(const meta_troika_t *troika) { + return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | + meta_eq(troika, 2, 0) << 2; } -static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { +__hot static bool meta_should_retry(const MDBX_env *env, + meta_troika_t *troika) { + const meta_troika_t prev = *troika; + *troika = meta_tap(env); + return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || + prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; +} + +static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, + const meta_troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->recent]; + r.ptr_v = METAPAGE(env, troika->recent); + r.is_steady = (troika->fsm >> troika->recent) & 1; + return r; +} + +static __always_inline meta_ptr_t +meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->prefer_steady]; + r.ptr_v = METAPAGE(env, troika->prefer_steady); + r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; + return r; +} + +static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, + const meta_troika_t *troika) { + const uint8_t tail = troika->tail_and_flags & 3; + meta_ptr_t r; + r.txnid = troika->txnid[tail]; + r.ptr_v = METAPAGE(env, tail); + r.is_steady = (troika->fsm >> tail) & 1; + return r; +} + +static const char *durable_caption(const volatile MDBX_meta *const meta) { if (META_IS_STEADY(meta)) - return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) == + return (unaligned_peek_u64_volatile(4, meta->mm_sign) == meta_sign((const MDBX_meta *)meta)) ? "Steady" : "Tainted"; return "Weak"; } +__cold static void meta_troika_dump(const MDBX_env *env, + const meta_troika_t *troika) { + const meta_ptr_t recent = meta_recent(env, troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); + const meta_ptr_t tail = meta_tail(env, troika); + NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + "head=%d-%" PRIaTXN ".%c, " + "base=%d-%" PRIaTXN ".%c, " + "tail=%d-%" PRIaTXN ".%c, " + "valid %c, strict %c", + troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], + (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], + (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, + recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, + prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', + troika->tail_and_flags % NUM_METAS, tail.txnid, + tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', + TROIKA_STRICT_VALID(troika) ? 
'Y' : 'N'); +} + /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. */ -static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - MDBX_env *env = txn->mt_env; - const txnid_t edge = mdbx_recent_steady_txnid(env); - mdbx_tassert(txn, edge <= txn->mt_txnid); +static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { + const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); + eASSERT(env, steady <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (unlikely(lck == NULL /* exclusive mode */)) { - mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); - return env->me_lck->mti_oldest_reader.weak = edge; + if (unlikely(lck == NULL /* exclusive without-lck mode */)) { + eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + return env->me_lck->mti_oldest_reader.weak = steady; } - const txnid_t last_oldest = + const txnid_t prev_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_tassert(txn, edge >= last_oldest); - if (likely(last_oldest == edge)) - return edge; + eASSERT(env, steady >= prev_oldest); - const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); - const uint32_t snap_readers_refresh_flag = - atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease); - mdbx_jitter4testing(false); - if (snap_readers_refresh_flag == nothing_changed) - return last_oldest; + txnid_t new_oldest = prev_oldest; + while (new_oldest != steady && + nothing_changed != + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { + lck->mti_readers_refresh_flag.weak = nothing_changed; + jitter4testing(false); + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + new_oldest = steady; - txnid_t oldest = edge; - atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ - const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); - if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { - oldest = snap; - if (oldest == last_oldest) - return oldest; + for (unsigned i = 0; i < snap_nreaders; ++i) { + const mdbx_pid_t pid = + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); + if (!pid) + continue; + jitter4testing(true); + + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely(rtxn < prev_oldest)) { + if (unlikely(nothing_changed == + atomic_load32(&lck->mti_readers_refresh_flag, + mo_AcquireRelease)) && + safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { + NOTICE("kick stuck reader[%u of %u].pid_%u %" PRIaTXN + " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, + i, snap_nreaders, pid, rtxn, prev_oldest, steady); + } + continue; + } + + if (rtxn < new_oldest) { + new_oldest = rtxn; + if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) + break; } } } - if (oldest != last_oldest) { - mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, - oldest); - mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak); - atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); + if (new_oldest != prev_oldest) { + VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, 
new_oldest); + eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); } - return oldest; + return new_oldest; +} + +static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { + return find_oldest_reader(txn->mt_env, + txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); } /* Find largest mvcc-snapshot still referenced. */ -__cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { +__cold static pgno_t find_largest_snapshot(const MDBX_env *env, + pgno_t last_used_page) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck != NULL /* exclusive mode */)) { + if (likely(lck != NULL /* check for exclusive without-lck mode */)) { + retry:; const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -9406,26 +8894,23 @@ __cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { mo_AcquireRelease) || snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; - if (largest < snap_pages && - atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= - /* ignore pending updates */ snap_txnid && - snap_txnid <= env->me_txn0->mt_txnid) - largest = snap_pages; + if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid) + last_used_page = snap_pages; } } } - return largest; + return last_used_page; } /* Add a page to the txn's dirty list */ -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages) { +__hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages) { #if xMDBX_DEBUG_SPILLING == 2 txn->mt_env->debug_dirtied_act += 1; - mdbx_ensure(txn->mt_env, - txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); - mdbx_ensure(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); + ENSURE(txn->mt_env, + txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); + ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); #endif /* xMDBX_DEBUG_SPILLING == 2 */ int rc; @@ -9433,35 +8918,34 @@ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { MDBX_page *loose = txn->tw.loose_pages; - mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); + rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned di = mdbx_dpl_search(txn, loose->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose); - mdbx_dpl_remove(txn, di); + unsigned di = dpl_search(txn, loose->mp_pgno); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); + dpl_remove(txn, di); txn->tw.loose_pages = loose->mp_next; txn->tw.loose_count--; txn->tw.dirtyroom++; if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, loose, 1); + dpage_free(txn->mt_env, loose, 1); } else { - mdbx_error("Dirtyroom is depleted, DPL length %u", - txn->tw.dirtylist->length); + ERROR("Dirtyroom is 
depleted, DPL length %u", txn->tw.dirtylist->length); if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, mp, npages); + dpage_free(txn->mt_env, mp, npages); return MDBX_TXN_FULL; } } - rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages); + rc = dpl_append(txn, mp->mp_pgno, mp, npages); if (unlikely(rc != MDBX_SUCCESS)) { bailout: txn->mt_flags |= MDBX_TXN_ERROR; return rc; } txn->tw.dirtyroom--; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); return MDBX_SUCCESS; } @@ -9495,11 +8979,10 @@ MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #if MDBX_ENABLE_MADVISE /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -__cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, - const bool enable, - const bool force_whole) { - mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); - mdbx_assert(env, (enable & 1) == (enable != 0)); +__cold static int set_readahead(MDBX_env *env, const pgno_t edge, + const bool enable, const bool force_whole) { + eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); + eASSERT(env, (enable & 1) == (enable != 0)); const bool toggle = force_whole || ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || !env->me_lck->mti_readahead_anchor; @@ -9515,12 +8998,12 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, length = (length < limit) ? length : limit; length -= offset; - mdbx_assert(env, 0 <= (intptr_t)length); + eASSERT(env, 0 <= (intptr_t)length); if (length == 0) return MDBX_SUCCESS; - mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF", - bytes2pgno(env, offset), bytes2pgno(env, offset + length)); + NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), + bytes2pgno(env, offset + length)); #if defined(F_RDAHEAD) if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) @@ -9622,9 +9105,9 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, } #endif /* MDBX_ENABLE_MADVISE */ -__cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno, const bool implicit) { +__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, const pgno_t limit_pgno, + const bool implicit) { const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); const size_t prev_size = env->me_dxb_mmap.current; @@ -9633,22 +9116,22 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const void *const prev_addr = env->me_map; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ - mdbx_verbose("resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR, - prev_size, size_bytes, prev_limit, limit_bytes); + VERBOSE("resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + prev_size, size_bytes, prev_limit, limit_bytes); - mdbx_assert(env, limit_bytes >= size_bytes); - mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); - mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + eASSERT(env, limit_bytes >= size_bytes); + eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); + eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); unsigned mresize_flags = env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode 
for: * - to avoid collision between read and write txns around env->me_dbgeo; - * - to avoid attachment of new reading threads (see mdbx_rdt_lock); */ - mdbx_srwlock_AcquireExclusive(&env->me_remap_guard); + * - to avoid attachment of new reading threads (see osal_rdt_lock); */ + osal_srwlock_AcquireExclusive(&env->me_remap_guard); mdbx_handle_array_t *suspended = NULL; mdbx_handle_array_t array_onstack; int rc = MDBX_SUCCESS; @@ -9670,9 +9153,9 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); array_onstack.count = 0; suspended = &array_onstack; - rc = mdbx_suspend_threads_before_remap(env, &suspended); + rc = osal_suspend_threads_before_remap(env, &suspended); if (rc != MDBX_SUCCESS) { - mdbx_error("failed suspend-for-remap: errcode %d", rc); + ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP @@ -9681,7 +9164,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #else /* Windows */ /* Acquire guard to avoid collision between read and write txns * around env->me_dbgeo */ - int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); + int rc = osal_fastmutex_acquire(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (limit_bytes == env->me_dxb_mmap.limit && @@ -9691,7 +9174,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && lck && !implicit) { - int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; + int err = osal_rdt_lock(env) /* lock readers table until remap done */; if (unlikely(MDBX_IS_ERROR(err))) { rc = err; goto bailout; @@ -9700,14 +9183,14 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, /* looking for readers from this process */ const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - mdbx_assert(env, !implicit); + eASSERT(env, !implicit); mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; for (unsigned i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) { + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { /* the base address of the mapping can't be changed since * the other reader thread from this process exists. */ - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); break; } @@ -9719,7 +9202,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), MDBX_SYNC_NONE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9727,9 +9210,9 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #if MDBX_ENABLE_MADVISE if (size_bytes < prev_size) { - mdbx_notice("resize-MADV_%s %u..%u", - (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", - size_pgno, bytes2pgno(env, prev_size)); + NOTICE("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? 
"REMOVE" : "DONTNEED", size_pgno, + bytes2pgno(env, prev_size)); rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) @@ -9762,7 +9245,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, } #endif /* MDBX_ENABLE_MADVISE */ - rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { @@ -9776,15 +9259,15 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, || prev_size > size_bytes #endif /* Windows */ ; - rc = mdbx_set_readahead(env, size_pgno, readahead, force); + rc = set_readahead(env, size_pgno, readahead, force); } #endif /* MDBX_ENABLE_MADVISE */ bailout: if (rc == MDBX_SUCCESS) { - mdbx_assert(env, size_bytes == env->me_dxb_mmap.current); - mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize); - mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); + eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); @@ -9796,15 +9279,15 @@ bailout: #endif /* MDBX_USE_VALGRIND */ } else { if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { - mdbx_error("failed resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + ERROR("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } else { - mdbx_warning("unable resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + WARNING("unable resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } if (!env->me_dxb_mmap.address) { env->me_flags |= MDBX_FATAL_ERROR; @@ -9816,31 +9299,31 @@ bailout: #if defined(_WIN32) || defined(_WIN64) int err = MDBX_SUCCESS; - mdbx_srwlock_ReleaseExclusive(&env->me_remap_guard); + osal_srwlock_ReleaseExclusive(&env->me_remap_guard); if (suspended) { - err = mdbx_resume_threads_after_remap(suspended); + err = osal_resume_threads_after_remap(suspended); if (suspended != &array_onstack) - mdbx_free(suspended); + osal_free(suspended); } #else if (env->me_lck_mmap.lck && (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) - mdbx_rdt_unlock(env); - int err = mdbx_fastmutex_release(&env->me_remap_guard); + osal_rdt_unlock(env); + int err = osal_fastmutex_release(&env->me_remap_guard); #endif /* Windows */ if (err != MDBX_SUCCESS) { - mdbx_fatal("failed resume-after-remap: errcode %d", err); + FATAL("failed resume-after-remap: errcode %d", err); return MDBX_PANIC; } return rc; } -__cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno) { +__cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, + const pgno_t limit_pgno) { const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); - mdbx_assert(env, mapped_pgno >= 
used_pgno); - return mdbx_mapresize( + eASSERT(env, mapped_pgno >= used_pgno); + return map_resize( env, used_pgno, size_pgno, (size_pgno > mapped_pgno) ? limit_pgno @@ -9850,44 +9333,42 @@ __cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, true); } -static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta, mdbx_filehandle_t fd) { +static int meta_unsteady(MDBX_env *env, const txnid_t last_steady, + MDBX_meta *const meta, mdbx_filehandle_t fd) { const uint64_t wipe = MDBX_DATASIGN_NONE; - if (unlikely(META_IS_STEADY(meta)) && - constmeta_txnid(env, meta) <= last_steady) { - mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, - data_page(meta)->mp_pgno); + if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) { + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, + data_page(meta)->mp_pgno); if (env->me_flags & MDBX_WRITEMAP) - unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); + unaligned_poke_u64(4, meta->mm_sign, wipe); else - return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), - (uint8_t *)&meta->mm_datasync_sign - env->me_map); - if (constmeta_txnid(env, meta) == last_steady) - mdbx_assert(env, meta_checktxnid(env, meta, true)); + return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign), + (uint8_t *)&meta->mm_sign - env->me_map); } return MDBX_SUCCESS; } -__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { +__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) { + MDBX_env *const env = txn->mt_env; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) ? env->me_dsync_fd : env->me_lazy_fd; - int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); + int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); + err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); + err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); if (unlikely(err != MDBX_SUCCESS)) return err; if (env->me_flags & MDBX_WRITEMAP) { - mdbx_flush_incoherent_cpu_writeback(); - err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + osal_flush_incoherent_cpu_writeback(); + err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -9904,19 +9385,488 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { } if (syncfilerange_unavailable) #endif /* MDBX_USE_SYNCFILERANGE */ - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); } /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + txn->tw.troika = meta_tap(env); + for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) + if (scan != txn) + scan->tw.troika = txn->tw.troika; return MDBX_SUCCESS; } 
+//------------------------------------------------------------------------------ + +MDBX_MAYBE_UNUSED __hot static pgno_t * +scan4seq_fallback(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING + assert(range[-1] == len); + const pgno_t *const detent = range + len - seq; + const ptrdiff_t offset = (ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[offset + 0] - range[0]; + const pgno_t diff1 = range[offset + 1] - range[1]; + const pgno_t diff2 = range[offset + 2] - range[2]; + const pgno_t diff3 = range[offset + 3] - range[3]; + if (diff0 == target) + return range + 0; + if (diff1 == target) + return range + 1; + if (diff2 == target) + return range + 2; + if (diff3 == target) + return range + 3; + range += 4; + } while (range + 3 < detent); + if (range == detent) + return nullptr; + } + do + if (range[offset] - *range == target) + return range; + while (++range < detent); +#else + assert(range[-(ptrdiff_t)len] == len); + const pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[-0] - range[offset - 0]; + const pgno_t diff1 = range[-1] - range[offset - 1]; + const pgno_t diff2 = range[-2] - range[offset - 2]; + const pgno_t diff3 = range[-3] - range[offset - 3]; + /* Смысл вычислений до ветвлений в том, чтобы позволить компилятору + * загружать и вычислять все значения параллельно. */ + if (diff0 == target) + return range - 0; + if (diff1 == target) + return range - 1; + if (diff2 == target) + return range - 2; + if (diff3 == target) + return range - 3; + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range > detent); +#endif /* MDBX_PNL sort-order */ + return nullptr; +} + +MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, + const unsigned seq) { + size_t begin = MDBX_PNL_ASCENDING ? 
1 : MDBX_PNL_SIZE(pnl); +#if MDBX_PNL_ASCENDING + while (seq <= MDBX_PNL_SIZE(pnl) - begin) { + if (pnl[begin + seq] - pnl[begin] == seq) + return pnl + begin; + ++begin; + } +#else + while (begin > seq) { + if (pnl[begin - seq] - pnl[begin] == seq) + return pnl + begin; + --begin; + } +#endif /* MDBX_PNL sort-order */ + return nullptr; +} + +#if defined(_MSC_VER) && !defined(__builtin_clz) && \ + !__has_builtin(__builtin_clz) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) { + unsigned long index; + _BitScanReverse(&index, value); + return index; +} +#endif /* _MSC_VER */ + +#if defined(_MSC_VER) && !defined(__builtin_clzl) && \ + !__has_builtin(__builtin_clzl) +#define __builtin_clzl(value) __builtin_clz(value) +#endif /* _MSC_VER */ + +#if !defined(MDBX_ATTRIBUTE_TARGET) && \ + (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) +#endif /* MDBX_ATTRIBUTE_TARGET */ + +#if defined(__SSE2__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) +#define __SSE2__ +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") +#endif /* __SSE2__ */ + +#if defined(__AVX2__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") +#endif /* __AVX2__ */ + +#if defined(__AVX512BW__) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw") +#endif /* __AVX512BW__ */ + +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 +MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned +diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * +scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m128i pattern = _mm_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 3)) { + do { + mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 28 - __builtin_clz(mask); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = 0xF << extra; + mask &= diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m256i pattern) { + const __m256i f = _mm256_loadu_si256((const __m256i *)ptr); + const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset)); + const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern); + return _mm256_movemask_ps(*(const __m256 *)&cmp); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * +scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m256i pattern = _mm256_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 7)) { + do { + mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 24 - __builtin_clz(mask); + } + range -= 8; + } while (range > detent + 7); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 28 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 8 - range); + assert(extra > 0 && extra < 8); + mask = 0xFF << extra; + mask &= diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned +diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, + const __m512i pattern) { + const __m512i f = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset)); + return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * +scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m512i pattern = _mm512_set1_epi32(target); + unsigned mask; + if (likely(len > seq + 15)) { + do { + mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 16 - __builtin_clz(mask); + } + range -= 16; + } while (range > detent + 15); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 60 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 16 - range); + assert(extra > 0 && extra < 16); + mask = 0xFFFF << extra; + mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 7 > detent) { + mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); + if (mask) + return range + 24 - __builtin_clz(mask); + range -= 8; + } + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, + const ptrdiff_t offset, + const uint32x4_t pattern) { + const uint32x4_t f = vld1q_u32(ptr); + const uint32x4_t l = vld1q_u32(ptr + offset); + const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern)); + if (sizeof(size_t) > 7) + return vget_lane_u64(vreinterpret_u64_u16(cmp), 0); + else + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))), + 0); +} + +__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, + const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const uint32x4_t pattern = vmovq_n_u32(target); + size_t mask; + if (likely(len > seq + 3)) { + do { + mask = diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return (pgno_t *)((char *)range - + (__builtin_clzl(mask) >> sizeof(size_t) / 4)); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = (~(size_t)0) << (extra * sizeof(size_t) * 2); + mask &= diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* __ARM_NEON || __ARM_NEON__ */ + +#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) +#define scan4seq_default scan4seq_avx512bw +#define scan4seq scan4seq_default +#elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) +#define scan4seq_default scan4seq_avx2 +#elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) +#define scan4seq_default scan4seq_sse2 +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define scan4seq_default scan4seq_neon +/* Choosing of another variants should be added here. */ +#endif /* scan4seq_default */ + +#ifndef scan4seq_default +#define scan4seq_default scan4seq_fallback +#endif /* scan4seq_default */ + +#ifdef scan4seq +/* The scan4seq() is the best or no alternatives */ +#else +#if !(__has_builtin(__builtin_cpu_supports) || \ + defined(__BUILTIN_CPU_SUPPORTS__) || \ + (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))) +/* The scan4seq_default() will be used since no cpu-features detection support + * from compiler. Please don't ask to implement cpuid-based detection and don't + * make such PRs. */ +#define scan4seq scan4seq_default +#else +/* Selecting the most appropriate implementation at runtime, + * depending on the available CPU features. */ +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const unsigned seq); +static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, + const unsigned seq) = scan4seq_resolver; + +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const unsigned seq) { + pgno_t *(*choice)(pgno_t * range, const size_t len, const unsigned seq) = + nullptr; +#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ + __GNUC_PREREQ(4, 8) + __builtin_cpu_init(); +#endif /* __builtin_cpu_init() */ +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 + if (__builtin_cpu_supports("sse2")) + choice = scan4seq_sse2; +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 + if (__builtin_cpu_supports("avx2")) + choice = scan4seq_avx2; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW + if (__builtin_cpu_supports("avx512bw")) + choice = scan4seq_avx512bw; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + /* Choosing of another variants should be added here. */ + scan4seq = choice ? choice : scan4seq_default; + return scan4seq(range, len, seq); +} +#endif /* __has_builtin(__builtin_cpu_supports */ +#endif /* scan4seq */ + +//------------------------------------------------------------------------------ + /* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. 
* @@ -9933,28 +9883,27 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { * * Returns 0 on success, non-zero on failure.*/ -#define MDBX_ALLOC_CACHE 1 -#define MDBX_ALLOC_GC 2 -#define MDBX_ALLOC_NEW 4 +#define MDBX_ALLOC_GC 1 +#define MDBX_ALLOC_NEW 2 +#define MDBX_ALLOC_COALESCE 4 #define MDBX_ALLOC_SLOT 8 #define MDBX_ALLOC_FAKE 16 #define MDBX_ALLOC_NOLOG 32 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) +#define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, - const pgno_t num, int flags) { - struct page_result ret; +static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { + pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; - mdbx_assert(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); - mdbx_assert(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); + eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); + eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); - const unsigned coalesce_threshold = - env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; + const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold) - flags &= ~MDBX_COALESCE; + flags |= env->me_flags & MDBX_LIFORECLAIM; + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) + flags |= MDBX_ALLOC_COALESCE; if (unlikely( /* If mc is updating the GC, then the retired-list cannot play catch-up with itself by growing while trying to save it. */ @@ -9965,46 +9914,18 @@ __hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, /* If our dirty list is already full, we can't touch GC */ (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); } - if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { - /* If there are any loose pages, just use them */ - mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0); - if (likely(txn->tw.loose_pages)) { -#if MDBX_ENABLE_REFUND - if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { - mdbx_refund(txn); - if (unlikely(!txn->tw.loose_pages)) - goto no_loose; - } -#endif /* MDBX_ENABLE_REFUND */ - - ret.page = txn->tw.loose_pages; - txn->tw.loose_pages = ret.page->mp_next; - txn->tw.loose_count--; - mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc), - ret.page->mp_pgno); - mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno); - mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page), - page_space(txn->mt_env)); - ret.page->mp_txnid = txn->mt_front; - ret.err = MDBX_SUCCESS; - return ret; - } - } -#if MDBX_ENABLE_REFUND -no_loose: -#endif /* MDBX_ENABLE_REFUND */ - - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); - txnid_t oldest = 0, last = 0; + unsigned re_len = MDBX_PNL_SIZE(re_list); + pgno_t *range = nullptr; + txnid_t detent = 0, last = 0; +#if 
MDBX_ENABLE_PGOP_STAT + uint64_t timestamp = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ while (true) { /* hsr-kick retry loop */ MDBX_cursor_couple recur; @@ -10014,55 +9935,41 @@ no_loose: /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. */ - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); - if (!(flags & (MDBX_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; - pgno = MDBX_PNL_LEAST(re_list); - if (likely(num == 1)) + eASSERT(env, + pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && + MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); + range = re_list + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + tASSERT(txn, range == scan4range_checker(re_list, num - 1)); + if (likely(range)) { + pgno = *range; goto done; - - const unsigned wanna_range = num - 1; -#if MDBX_PNL_ASCENDING - mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); - while (true) { - unsigned range_end = range_begin + wanna_range; - if (re_list[range_end] - pgno == wanna_range) - goto done; - if (range_end == re_len) - break; - pgno = re_list[++range_begin]; } -#else - mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); - while (true) { - if (re_list[range_begin - wanna_range] - pgno == wanna_range) - goto done; - if (range_begin == wanna_range) - break; - pgno = re_list[--range_begin]; - } -#endif /* MDBX_PNL sort-order */ } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; - /* Prepare to fetch more and coalesce */ - oldest = (flags & MDBX_LIFORECLAIM) - ? 
mdbx_find_oldest(txn) - : atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease); - ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); + /* Prepare to fetch and coalesce */ +#if MDBX_ENABLE_PGOP_STAT + if (likely(timestamp == 0)) + timestamp = osal_monotime(); +#endif /* MDBX_ENABLE_PGOP_STAT */ + detent = txn_oldest_reader(txn) + 1; + + ret.err = cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; if (flags & MDBX_LIFORECLAIM) { /* Begin from oldest reader if any */ - if (oldest > MIN_TXNID) { - last = oldest - 1; + if (detent > MIN_TXNID) { + last = detent - 1; op = MDBX_SET_RANGE; } } else if (txn->tw.last_reclaimed) { @@ -10077,9 +9984,9 @@ no_loose: if (!(flags & MDBX_LIFORECLAIM)) { /* Do not try fetch more if the record will be too recent */ - if (op != MDBX_FIRST && ++last >= oldest) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) + if (op != MDBX_FIRST && ++last >= detent) { + detent = txn_oldest_reader(txn) + 1; + if (detent <= last) break; } } @@ -10088,10 +9995,10 @@ no_loose: if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - txnid_t snap = mdbx_find_oldest(txn); - if (oldest < snap) { - oldest = snap; - last = oldest - 1; + const txnid_t snap = txn_oldest_reader(txn); + if (unlikely(detent <= snap)) { + detent = snap + 1; + last = snap; key.iov_base = &last; key.iov_len = sizeof(last); op = MDBX_SET_RANGE; @@ -10104,20 +10011,14 @@ no_loose: goto fail; } - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { ret.err = MDBX_CORRUPTED; goto fail; } last = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(last < MIN_TXNID || last > MAX_TXNID)) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - if (oldest <= last) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) { + if (detent <= last) { + detent = txn_oldest_reader(txn) + 1; + if (detent <= last) { if (flags & MDBX_LIFORECLAIM) continue; break; @@ -10138,14 +10039,14 @@ no_loose: /* Reading next GC record */ MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; - if (unlikely((ret.err = mdbx_node_read( + if (unlikely((ret.err = node_read( &recur.outer, page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, pp_txnid4chk(mp, txn))) != MDBX_SUCCESS)) + &data, mp)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { ret.err = MDBX_ENOMEM; goto fail; @@ -10153,11 +10054,12 @@ no_loose: } /* Append PNL from GC record to tw.reclaimed_pglist */ - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); + cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); - if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { + tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } @@ -10171,83 +10073,79 @@ no_loose: txn->mt_next_pgno + (size_t)num) || gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid overflow the page list. + /* Stop reclaiming to avoid large/overflow the page list. 
* This is a rare case while search for a continuously multi-page region * in a large database. * todo4recovery://erased_by_github/libmdbx/issues/123 */ - mdbx_notice("stop reclaiming to avoid PNL overflow: %u (current) + %u " - "(chunk) -> %u", - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + NOTICE("stop reclaiming to avoid PNL overflow: %u (current) + %u " + "(chunk) -> %u", + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, + gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); break; } - ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); + ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; re_list = txn->tw.reclaimed_pglist; /* Remember ID of GC record */ if (flags & MDBX_LIFORECLAIM) { - ret.err = mdbx_txl_append(&txn->tw.lifo_reclaimed, last); + ret.err = txl_append(&txn->tw.lifo_reclaimed, last); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; } txn->tw.last_reclaimed = last; - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", + last, txn->mt_dbs[FREE_DBI].md_root, gc_len); for (unsigned i = gc_len; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); - mdbx_debug_extra_print("%s\n", "."); + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); } /* Merge in descending sorted order */ - const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); - mdbx_pnl_xmerge(re_list, gc_pnl); - /* re-check to avoid duplicates */ - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { + pnl_merge(re_list, gc_pnl); + if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); re_len = MDBX_PNL_SIZE(re_list); - mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); + tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); if (MDBX_ENABLE_REFUND && re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ - mdbx_refund(txn); + txn_refund(txn); re_list = txn->tw.reclaimed_pglist; re_len = MDBX_PNL_SIZE(re_list); } /* Done for a kick-reclaim mode, actually no page needed */ if (unlikely(flags & MDBX_ALLOC_SLOT)) { - mdbx_debug("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); + DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); +#if MDBX_ENABLE_PGOP_STAT + eASSERT(env, timestamp != 0); + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ ret.err = MDBX_SUCCESS; ret.page = NULL; return ret; } /* Don't try to coalesce too much. 
*/ - if (flags & MDBX_COALESCE) { - if (re_len /* current size */ > coalesce_threshold || - (re_len > prev_re_len && - re_len - prev_re_len /* delta from prev */ >= - coalesce_threshold / 2)) { - mdbx_trace("clear %s %s", "MDBX_COALESCE", "since got threshold"); - flags &= ~MDBX_COALESCE; - } + if (re_len /* current size */ > coalesce_threshold) { + if (flags & MDBX_ALLOC_COALESCE) + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); + flags &= ~MDBX_ALLOC_COALESCE; } } - if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_GC)) { - mdbx_debug_extra("clear %s and continue", "MDBX_COALESCE"); - flags &= ~MDBX_COALESCE; + if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) { + DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE"); + flags &= ~MDBX_ALLOC_COALESCE; continue; } @@ -10259,21 +10157,20 @@ no_loose: * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. */ - range_begin = 0; + range = nullptr; pgno = txn->mt_next_pgno; const size_t next = (size_t)pgno + num; if (flags & MDBX_ALLOC_GC) { - const MDBX_meta *const head = constmeta_prefer_last(env); - const MDBX_meta *const steady = constmeta_prefer_steady(env); + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); /* does reclaiming stopped at the last steady point? */ - if (head != steady && META_IS_STEADY(steady) && - oldest == constmeta_txnid(env, steady)) { - mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN - "-%s, oldest %" PRIaTXN, - constmeta_txnid(env, head), mdbx_durable_str(head), - constmeta_txnid(env, steady), mdbx_durable_str(steady), - oldest); + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); ret.err = MDBX_RESULT_TRUE; const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -10289,18 +10186,19 @@ no_loose: * AND auto-sync threshold it NOT specified */ if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && ((autosync_threshold | autosync_period) == 0 || - next >= steady->mm_geo.now)) { + next >= prefer_steady.ptr_c->mm_geo.now)) { /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode * without any auto-sync threshold(s). */ - ret.err = mdbx_wipe_steady(env, oldest); - mdbx_debug("gc-wipe-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); + ret.err = wipe_steady(txn, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); } else if ((flags & MDBX_ALLOC_NEW) == 0 || (autosync_threshold && atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period) || @@ -10308,21 +10206,18 @@ no_loose: (next >= txn->mt_end_pgno && (autosync_threshold | autosync_period) == 0)) { /* make steady checkpoint. 
*/ - MDBX_meta meta = *head; - ret.err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); - mdbx_debug("gc-make-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + } + if (likely(ret.err != MDBX_RESULT_TRUE)) { + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + continue; } - if (ret.err == MDBX_SUCCESS) { - if (mdbx_find_oldest(txn) > oldest) - continue; - /* it is reasonable check/kick lagging reader(s) here, - * since we made a new steady point or wipe the last. */ - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; - } else if (unlikely(ret.err != MDBX_RESULT_TRUE)) - goto fail; } } @@ -10330,44 +10225,51 @@ no_loose: * at the end of database file. */ if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) goto done; - if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; + + if (flags & MDBX_ALLOC_GC) { + const txnid_t laggard = txn_oldest_reader(txn); + if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && + kick_longlived_readers(env, laggard) >= detent)) + continue; + } ret.err = MDBX_NOTFOUND; if (flags & MDBX_ALLOC_NEW) { ret.err = MDBX_MAP_FULL; if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) { - mdbx_assert(env, next > txn->mt_end_pgno); + eASSERT(env, next > txn->mt_end_pgno); const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); size_t aligned = pgno_align2os_pgno( env, (pgno_t)(next + grow_step - next % grow_step)); if (aligned > txn->mt_geo.upper) aligned = txn->mt_geo.upper; - mdbx_assert(env, aligned > txn->mt_end_pgno); + eASSERT(env, aligned > txn->mt_end_pgno); - mdbx_verbose("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = mdbx_mapresize_implicit(env, txn->mt_next_pgno, - (pgno_t)aligned, txn->mt_geo.upper); + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->mt_end_pgno); + ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper); if (ret.err == MDBX_SUCCESS) { env->me_txn->mt_end_pgno = (pgno_t)aligned; goto done; } - mdbx_error("unable growth datafile to %zu pages (+%zu), errcode %d", - aligned, aligned - txn->mt_end_pgno, ret.err); + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->mt_end_pgno, ret.err); } else { - mdbx_notice("gc-alloc: next %zu > upper %" PRIaPGNO, next, - txn->mt_geo.upper); + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next, + txn->mt_geo.upper); } } fail: - mdbx_assert(env, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PGOP_STAT + if (timestamp) + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); int level; const char *what; if (likely(!(flags & MDBX_ALLOC_FAKE))) { @@ -10378,22 +10280,26 @@ no_loose: level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; what = (flags & MDBX_ALLOC_SLOT) ? 
"gc-slot/backlog" : "backlog-pages"; } - if (mdbx_log_enabled(level)) - mdbx_debug_log(level, __func__, __LINE__, - "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, + flags, ret.err); - mdbx_assert(env, ret.err != MDBX_SUCCESS); + eASSERT(env, ret.err != MDBX_SUCCESS); ret.page = NULL; return ret; } done: - mdbx_assert(env, !(flags & MDBX_ALLOC_SLOT)); - mdbx_ensure(env, pgno >= NUM_METAS); + eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); + ENSURE(env, pgno >= NUM_METAS); +#if MDBX_ENABLE_PGOP_STAT + if (likely(timestamp)) + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ if (unlikely(flags & MDBX_ALLOC_FAKE)) { - mdbx_debug("return NULL-page for %u pages %s allocation", num, - "gc-slot/backlog"); + DEBUG("return NULL-page for %u pages %s allocation", num, + "gc-slot/backlog"); ret.page = NULL; ret.err = MDBX_SUCCESS; return ret; @@ -10401,37 +10307,35 @@ done: if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); - /* LY: reset no-access flag from mdbx_page_loose() */ VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); } else { - ret.page = mdbx_page_malloc(txn, num); + ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; goto fail; } } - if (range_begin) { - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - mdbx_tassert(txn, pgno < txn->mt_next_pgno); - mdbx_tassert(txn, pgno == re_list[range_begin]); + if (range) { + cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); + tASSERT(txn, pgno < txn->mt_next_pgno); + tASSERT(txn, pgno == *range); /* Cutoff allocated pages from tw.reclaimed_pglist */ #if MDBX_PNL_ASCENDING - for (unsigned i = range_begin + num; i <= re_len;) - re_list[range_begin++] = re_list[i++]; - MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; + for (const pgno_t *const end = re_list + re_len - num; range <= end; + ++range) + *range = range[num]; #else - MDBX_PNL_SIZE(re_list) = re_len -= num; - for (unsigned i = range_begin - num; i < re_len;) - re_list[++i] = re_list[++range_begin]; + for (const pgno_t *const end = re_list + re_len; ++range <= end;) + range[-(ptrdiff_t)num] = *range; #endif - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_PNL_SIZE(re_list) = re_len -= num; + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { txn->mt_next_pgno = pgno + num; - mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); + eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); } if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -10441,23 +10345,90 @@ done: ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; - if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) { + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { ret.page->mp_pages = num; ret.page->mp_flags = P_OVERFLOW; } - ret.err = mdbx_page_dirty(txn, ret.page, num); + ret.err = page_dirty(txn, ret.page, num); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return ret; } -/* Copy the used portions of a 
non-overflow page. */ -__hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, - size_t psize) { +__hot static pgr_t page_alloc(MDBX_cursor *mc) { + MDBX_txn *const txn = mc->mc_txn; + + /* If there are any loose pages, just use them */ + while (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND + if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { + txn_refund(txn); + if (!txn->tw.loose_pages) + break; + } +#endif /* MDBX_ENABLE_REFUND */ + + MDBX_page *page = txn->tw.loose_pages; + txn->tw.loose_pages = page->mp_next; + txn->tw.loose_count--; + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno); + tASSERT(txn, page->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, page->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); + page->mp_txnid = txn->mt_front; + pgr_t ret = {page, MDBX_SUCCESS}; + return ret; + } + + if (likely(!(mc->mc_flags & C_GCFREEZE))) { + MDBX_PNL pnl = txn->tw.reclaimed_pglist; + const unsigned len = MDBX_PNL_SIZE(pnl); + if (likely(len > 0)) { + MDBX_PNL_SIZE(pnl) = len - 1; +#if MDBX_PNL_ASCENDING + const pgno_t pgno = pnl[1]; + for (unsigned i = 1; i < len; ++i) + pnl[i] = pnl[i + 1]; +#else + const pgno_t pgno = pnl[len]; +#endif + + MDBX_env *const env = txn->mt_env; + pgr_t ret; + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); + } else { + ret.page = page_malloc(txn, 1); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + return ret; + } + } + + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); + + ret.err = page_dirty(txn, ret.page, 1); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + return ret; + } + } + + return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); +} + +/* Copy the used portions of a non-large/overflow page. */ +__hot static void page_copy(MDBX_page *dst, const MDBX_page *src, + size_t psize) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { @@ -10481,34 +10452,34 @@ __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, * * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. */ -static struct page_result __must_check_result -mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { - mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); - mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - mdbx_tassert(txn, IS_SPILLED(txn, mp)); +static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, + const MDBX_page *const mp) { + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, IS_SPILLED(txn, mp)); const MDBX_txn *scan = txn; - struct page_result ret; + pgr_t ret; do { - mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); - const unsigned si = mdbx_search_spilled(scan, mp->mp_pgno); + tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); + const unsigned si = search_spilled(scan, mp->mp_pgno); if (!si) continue; const unsigned npages = IS_OVERFLOW(mp) ? 
mp->mp_pages : 1; - ret.page = mdbx_page_malloc(txn, npages); + ret.page = page_malloc(txn, npages); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; return ret; } - mdbx_page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); + page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); if (scan == txn) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. */ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ - ret.err = mdbx_page_dirty(txn, ret.page, npages); + ret.err = page_dirty(txn, ret.page, npages); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; #if MDBX_ENABLE_PGOP_STAT @@ -10519,11 +10490,11 @@ mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { return ret; } while (likely((scan = scan->mt_parent) != nullptr && (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); - mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN - " not found in the spill-list(s), current txn %" PRIaTXN - " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, - mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, - txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); + ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, + txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); ret.err = MDBX_PROBLEM; ret.page = NULL; return ret; @@ -10535,25 +10506,25 @@ mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { * [in] mc cursor pointing to the page to be touched * * Returns 0 on success, non-zero on failure. 
*/ -__hot static int mdbx_page_touch(MDBX_cursor *mc) { +__hot static int page_touch(MDBX_cursor *mc) { const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; MDBX_page *np; MDBX_txn *txn = mc->mc_txn; int rc; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_tassert(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_tassert(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - mdbx_tassert(txn, *couple->outer.mc_dbistate & DBI_DIRTY); + tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); } else { - mdbx_tassert(txn, *mc->mc_dbistate & DBI_DIRTY); + tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY); } - mdbx_tassert(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - mdbx_tassert(txn, !IS_OVERFLOW(mp)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, !IS_OVERFLOW(mp)); + tASSERT(txn, dirtylist_check(txn)); } if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) @@ -10561,20 +10532,20 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { if (IS_FROZEN(txn, mp)) { /* CoW the page */ - rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); + rc = pnl_need(&txn->tw.retired_pages, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const pgr_t par = page_alloc(mc); rc = par.err; np = par.page; if (unlikely(rc != MDBX_SUCCESS)) goto fail; const pgno_t pgno = np->mp_pgno; - mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), - mp->mp_pgno, pgno); - mdbx_tassert(txn, mp->mp_pgno != pgno); - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), + mp->mp_pgno, pgno); + tASSERT(txn, mp->mp_pgno != pgno); + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; @@ -10587,43 +10558,43 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; np->mp_txnid = txn->mt_front; } else if (IS_SPILLED(txn, mp)) { - struct page_result pur = mdbx_page_unspill(txn, mp); + pgr_t pur = page_unspill(txn, mp); np = pur.page; rc = pur.err; if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, np != nullptr); + tASSERT(txn, np != nullptr); goto done; } goto fail; } else { if (unlikely(!txn->mt_parent)) { - mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + IS_BRANCH(mp) ? 
"branch" : "leaf", mp->mp_pgno, mp->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); rc = MDBX_PROBLEM; goto fail; } - mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->length <= - MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); + tASSERT(txn, + txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); /* No - copy it */ - np = mdbx_page_malloc(txn, 1); + np = page_malloc(txn, 1); if (unlikely(!np)) { rc = MDBX_ENOMEM; goto fail; } - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); /* insert a clone of parent's dirty page, so don't touch dirtyroom */ - rc = mdbx_page_dirty(txn, np, 1); + rc = page_dirty(txn, np, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -10664,8 +10635,7 @@ fail: return rc; } -__cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, - bool nonblock) { +__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { bool locked = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; @@ -10681,15 +10651,22 @@ retry:; goto bailout; } + const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); + meta_ptr_t head; + if (inside_txn | locked) + head = meta_recent(env, &env->me_txn0->tw.troika); + else { + const meta_troika_t troika = meta_tap(env); + head = meta_recent(env, &troika); + } const pgno_t unsynced_pages = atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t head_txnid = meta_txnid(env, head); - const uint32_t synched_meta_txnid_u32 = - atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); - if (unsynced_pages == 0 && synched_meta_txnid_u32 == (uint32_t)head_txnid && - META_IS_STEADY(head)) - goto bailout; + if (unsynced_pages == 0) { + const uint32_t synched_meta_txnid_u32 = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); + if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) + goto bailout; + } const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -10697,46 +10674,51 @@ retry:; atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - const bool inside_txn = (env->me_txn0->mt_owner == mdbx_thread_self()); if (!inside_txn) { if (!locked) { - int err; +#if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + int err; /* pre-sync to avoid latency for writer */ if (unsynced_pages > /* FIXME: define threshold */ 16 && (flags & MDBX_SAFE_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { /* Acquire guard to avoid collision with remap */ #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); #else - err = mdbx_fastmutex_acquire(&env->me_remap_guard); + err = osal_fastmutex_acquire(&env->me_remap_guard); if (unlikely(err != MDBX_SUCCESS)) return err; #endif - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); - err = mdbx_msync(&env->me_dxb_mmap, 0, 
usedbytes, MDBX_SYNC_DATA); + const size_t usedbytes = + pgno_align2os_bytes(env, head.ptr_c->mm_geo.next); + err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #else - int unlock_err = mdbx_fastmutex_release(&env->me_remap_guard); + int unlock_err = osal_fastmutex_release(&env->me_remap_guard); if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) err = unlock_err; #endif } else - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; - /* pre-sync done */ +#if MDBX_ENABLE_PGOP_STAT wops = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + /* pre-sync done */ rc = MDBX_SUCCESS /* means "some data was synced" */; } @@ -10748,25 +10730,25 @@ retry:; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ + env->me_txn0->tw.troika = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); goto retry; } - env->me_txn0->mt_txnid = head_txnid; - mdbx_assert(env, head_txnid == meta_txnid(env, head)); - mdbx_assert(env, head_txnid == mdbx_recent_committed_txnid(env)); - mdbx_find_oldest(env->me_txn0); + eASSERT(env, head.txnid == recent_committed_txnid(env)); + env->me_txn0->mt_txnid = head.txnid; + txn_oldest_reader(env->me_txn0); flags |= MDBX_SHRINK_ALLOWED; } - mdbx_assert(env, inside_txn || locked); - mdbx_assert(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); + eASSERT(env, inside_txn || locked); + eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); - if (!META_IS_STEADY(head) || - ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, - data_page((const void *)head)->mp_pgno, mdbx_durable_str(head), - unsynced_pages); - MDBX_meta meta = *head; - rc = mdbx_sync_locked(env, flags, &meta); + if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, + data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), + unsynced_pages); + MDBX_meta meta = *head.ptr_c; + rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -10774,17 +10756,17 @@ retry:; /* LY: sync meta-pages if MDBX_NOMETASYNC enabled * and someone was not synced above. */ if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head_txnid) { + (uint32_t)head.txnid) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, + ? 
osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA | MDBX_SYNC_IODQ) - : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) - atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, + atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid, mo_Relaxed); } @@ -10802,7 +10784,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { return MDBX_EBADSIGN; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -10814,7 +10796,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { if (wanna_active) { if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) return MDBX_EPERM; - mdbx_assert(env, env->me_map != nullptr); + eASSERT(env, env->me_map != nullptr); } return MDBX_SUCCESS; @@ -10825,7 +10807,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_env_sync_internal(env, force, nonblock); + return env_sync(env, force, nonblock); } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -10837,7 +10819,7 @@ __cold int mdbx_env_sync_poll(MDBX_env *env) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { +static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { for (int i = parent->mt_numdbs; --i >= 0;) { nested->mt_cursors[i] = NULL; MDBX_cursor *mc = parent->mt_cursors[i]; @@ -10848,7 +10830,7 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { bk = mc; if (mc->mc_signature != MDBX_MC_LIVE) continue; - bk = mdbx_malloc(size); + bk = osal_malloc(size); if (unlikely(!bk)) return MDBX_ENOMEM; #if MDBX_DEBUG @@ -10882,7 +10864,7 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. 
*/ -static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { +static void cursors_eot(MDBX_txn *txn, const bool merge) { for (int i = txn->mt_numdbs; --i >= 0;) { MDBX_cursor *next, *mc = txn->mt_cursors[i]; if (!mc) @@ -10892,14 +10874,14 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { const unsigned stage = mc->mc_signature; MDBX_cursor *bk = mc->mc_backup; next = mc->mc_next; - mdbx_ensure(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - mdbx_cassert(mc, mc->mc_dbi == (unsigned)i); + ENSURE(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + cASSERT(mc, mc->mc_dbi == (unsigned)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; - mdbx_cassert(mc, mx == bk->mc_xcursor); - mdbx_tassert(txn, txn->mt_parent != NULL); - mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, mx == bk->mc_xcursor); + tASSERT(txn, txn->mt_parent != NULL); + ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { @@ -10923,9 +10905,9 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; - mdbx_free(bk); + osal_free(bk); } else { - mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE); + ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } @@ -10935,7 +10917,7 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) /* Find largest mvcc-snapshot still referenced by this process. */ -static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { +static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* exclusive mode */)) { const unsigned snap_nreaders = @@ -10944,7 +10926,7 @@ static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == env->me_pid) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -10965,7 +10947,7 @@ static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { return largest; } -static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { +static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { #if !defined(__SANITIZE_ADDRESS__) if (!RUNNING_ON_VALGRIND) return; @@ -10981,10 +10963,9 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { } else { /* transaction end */ bool should_unlock = false; pgno_t last = MAX_PAGENO + 1; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) { + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ - const MDBX_meta *head = constmeta_prefer_last(env); - last = head->mm_geo.next; + last = meta_recent(env, &env->me_txn0->troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; @@ -10997,10 +10978,10 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { return; } - last = mdbx_find_largest_this(env, last); + last = find_largest_this(env, last); const pgno_t edge = 
env->me_poison_edge; if (edge > last) { - mdbx_assert(env, last >= NUM_METAS); + eASSERT(env, last >= NUM_METAS); env->me_poison_edge = last; VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), pgno2bytes(env, edge - last)); @@ -11019,28 +11000,28 @@ typedef struct { } bind_rslot_result; static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { - mdbx_assert(env, env->me_lck_mmap.lck); - mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); - mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); + eASSERT(env, env->me_lck_mmap.lck); + eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); - bind_rslot_result result = {mdbx_rdt_lock(env), nullptr}; + bind_rslot_result result = {osal_rdt_lock(env), nullptr}; if (unlikely(MDBX_IS_ERROR(result.err))) return result; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_PANIC; return result; } if (unlikely(!env->me_map)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_EPERM; return result; } if (unlikely(env->me_live_reader != env->me_pid)) { - result.err = mdbx_rpid_set(env); + result.err = osal_rpid_set(env); if (unlikely(result.err != MDBX_SUCCESS)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); return result; } env->me_live_reader = env->me_pid; @@ -11049,18 +11030,18 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { result.err = MDBX_SUCCESS; unsigned slot, nreaders; while (1) { - nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed); + nreaders = env->me_lck->mti_numreaders.weak; for (slot = 0; slot < nreaders; slot++) - if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) == - 0) + if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, + mo_AcquireRelease)) break; if (likely(slot < env->me_maxreaders)) break; - result.err = mdbx_cleanup_dead_readers(env, true, NULL); + result.err = cleanup_dead_readers(env, true, NULL); if (result.err != MDBX_RESULT_TRUE) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err; return result; @@ -11073,17 +11054,16 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ - atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed); + atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) - atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed); - atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 0 : tid, - mo_Relaxed); - atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed); - mdbx_rdt_unlock(env); + env->me_lck->mti_numreaders.weak = ++nreaders; + result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; + atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); + osal_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, env->me_live_reader == env->me_pid); + eASSERT(env, env->me_live_reader == env->me_pid); thread_rthc_set(env->me_txkey, result.rslot); } return result; @@ -11098,22 +11078,22 @@ __cold int mdbx_thread_register(const MDBX_env *env) { return (env->me_flags & MDBX_EXCLUSIVE) ? 
MDBX_EINVAL : MDBX_EPERM; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_EINVAL /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid)) return MDBX_BAD_RSLOT; return MDBX_RESULT_TRUE /* already registered */; } - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) return MDBX_TXN_OVERLAPPING; return bind_rslot((MDBX_env *)env, tid).err; @@ -11128,22 +11108,23 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_RESULT_TRUE; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid || - r->mr_tid.weak != mdbx_thread_self())) + r->mr_tid.weak != osal_thread_self())) return MDBX_BAD_RSLOT; + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BUSY /* transaction is still active */; @@ -11155,45 +11136,43 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { } /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, - bool report) { - const txnid_t meta_txnid = constmeta_txnid(env, meta); - const txnid_t freedb_mod_txnid = meta->mm_dbs[FREE_DBI].md_mod_txnid; - const txnid_t maindb_mod_txnid = meta->mm_dbs[MAIN_DBI].md_mod_txnid; +static bool coherency_check(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_db *dbs, + const volatile MDBX_meta *meta, bool report) { + const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid; + const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid; - const pgno_t freedb_root_pgno = meta->mm_dbs[FREE_DBI].md_root; + const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root; const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno != P_INVALID) ? pgno2page(env, freedb_root_pgno) : nullptr; - const pgno_t maindb_root_pgno = meta->mm_dbs[MAIN_DBI].md_root; + const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root; const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID) ? 
pgno2page(env, maindb_root_pgno) : nullptr; - const uint64_t magic_and_version = - unaligned_peek_u64(4, &meta->mm_magic_and_version); + unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); + bool ok = true; - if (unlikely(meta_txnid < freedb_mod_txnid || + if (unlikely(txnid < freedb_mod_txnid || (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "free", freedb_mod_txnid, meta_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "free", freedb_mod_txnid, txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } - if (unlikely(meta_txnid < maindb_mod_txnid || + if (unlikely(txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "main", maindb_mod_txnid, meta_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "main", maindb_mod_txnid, txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } if (likely(freedb_root && freedb_mod_txnid)) { @@ -11203,10 +11182,10 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const txnid_t root_txnid = freedb_root->mp_txnid; if (unlikely(root_txnid != freedb_mod_txnid)) { if (report) - mdbx_warning( - "catch invalid root_page_txnid %" PRIaTXN - " for %sdb_mod_txnid %" PRIaTXN " %s", - root_txnid, "free", maindb_mod_txnid, + WARNING( + "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } @@ -11218,10 +11197,10 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const txnid_t root_txnid = maindb_root->mp_txnid; if (unlikely(root_txnid != maindb_mod_txnid)) { if (report) - mdbx_warning( - "catch invalid root_page_txnid %" PRIaTXN - " for %sdb_mod_txnid %" PRIaTXN " %s", - root_txnid, "main", maindb_mod_txnid, + WARNING( + "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } @@ -11229,21 +11208,16 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, return ok; } -/* check with timeout as the workaround - * for todo4recovery://erased_by_github/libmdbx/issues/269 */ -static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, - uint64_t *timestamp) { - if (likely(meta_checktxnid(env, (const MDBX_meta *)meta, !*timestamp))) - return MDBX_SUCCESS; - - if (!*timestamp) - *timestamp = mdbx_osal_monotime(); - else if (unlikely(mdbx_osal_monotime() - *timestamp > 65536 / 10)) { - mdbx_error("bailout waiting for valid snapshot %s", - "(workaround for incoherent flaw of unified page/buffer cache)"); +__cold static int coherency_timeout(uint64_t *timestamp) { + if (likely(timestamp && *timestamp == 0)) + *timestamp = osal_monotime(); + else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { + ERROR("bailout 
waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); return MDBX_CORRUPTED; } + osal_memory_fence(mo_AcquireRelease, true); #if defined(_WIN32) || defined(_WIN64) SwitchToThread(); #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) @@ -11256,13 +11230,49 @@ static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, return MDBX_RESULT_TRUE; } +/* check with timeout as the workaround + * for todo4recovery://erased_by_github/libmdbx/issues/269 */ +__hot static int coherency_check_readed(const MDBX_env *env, + const txnid_t txnid, + const volatile MDBX_db *dbs, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) + return coherency_timeout(timestamp); + return MDBX_SUCCESS; +} + +static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + const txnid_t head_txnid = meta_txnid(meta); + if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) { + if (report) + WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", + (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid, + bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb), + "(workaround for incoherent flaw of unified page/buffer cache)"); + return coherency_timeout(timestamp); + } + return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); +} + +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report) { + uint64_t timestamp = 0; + return coherency_check_written(env, 0, meta, report ? ×tamp : nullptr) == + MDBX_SUCCESS; +} + /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ -static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { +static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_env *env = txn->mt_env; int rc; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -11281,28 +11291,28 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (flags & MDBX_TXN_RDONLY) { - mdbx_assert(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); txn->mt_flags = MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !(env->me_flags & MDBX_NOTLS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { if (unlikely(!r->mr_pid.weak) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); } } } else { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); } if (likely(r)) { @@ -11317,12 +11327,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } txn->to.reader = r; if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { - mdbx_assert(env, txn->mt_txnid == 0); - mdbx_assert(env, txn->mt_owner == 0); - mdbx_assert(env, txn->mt_numdbs == 0); + eASSERT(env, txn->mt_txnid == 0); + eASSERT(env, txn->mt_owner == 0); + eASSERT(env, txn->mt_numdbs == 0); if (likely(r)) { - mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0); - mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, r->mr_snapshot_pages_used.weak == 0); + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); } txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; @@ -11330,107 +11340,100 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Seek & fetch the last meta */ - if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { - uint64_t timestamp = 0; - while (1) { - volatile const MDBX_meta *const meta = meta_prefer_last(env); - mdbx_jitter4testing(false); - const txnid_t snap = meta_txnid(env, meta); - mdbx_jitter4testing(false); - if (likely(r)) { - safe64_reset(&r->mr_txnid, false); - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, - mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64_volatile(4, meta->mm_pages_retired), - mo_Relaxed); - safe64_write(&r->mr_txnid, snap); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == snap); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_AcquireRelease); - } else { - /* exclusive mode without lck */ - } - mdbx_jitter4testing(true); + uint64_t timestamp = 0; + unsigned loop = 0; + meta_troika_t troika = meta_tap(env); + while (1) { + const meta_ptr_t head = + likely(env->me_stuck_meta < 0) + ? /* regular */ meta_recent(env, &troika) + : /* recovery mode */ meta_ptr(env, env->me_stuck_meta); + if (likely(r)) { + safe64_reset(&r->mr_txnid, false); + atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next, + mo_Relaxed); + atomic_store64( + &r->mr_snapshot_pages_retired, + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired), + mo_Relaxed); + safe64_write(&r->mr_txnid, head.txnid); + eASSERT(env, r->mr_pid.weak == osal_getpid()); + eASSERT(env, + r->mr_tid.weak == + ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); + eASSERT(env, r->mr_txnid.weak == head.txnid || + (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && + head.txnid < env->me_lck->mti_oldest_reader.weak)); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); + } else { + /* exclusive mode without lck */ + eASSERT(env, !env->me_lck_mmap.lck && + env->me_lck == (void *)&env->x_lckless_stub); + } + jitter4testing(true); - /* Snap the state from current meta-head */ - txn->mt_txnid = snap; - txn->mt_geo = meta->mm_geo; - STATIC_ASSERT(CORE_DBS == 2); - txn->mt_dbs[0] = meta->mm_dbs[0]; - txn->mt_dbs[1] = meta->mm_dbs[1]; - txn->mt_canary = meta->mm_canary; + /* Snap the state from current meta-head */ + txn->mt_txnid = head.txnid; + txn->mt_geo = head.ptr_v->mm_geo; + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = head.ptr_v->mm_canary; - /* LY: Retry on a race, ITS#7970. - * The barrier is not needed here since C11-atomics are used, - * but it is reasonable paranoia to avoid compiler misoptimization - * and makes clarity for code readers. 
*/ - mdbx_compiler_barrier(); - if (likely(meta == meta_prefer_last(env) && - snap == meta_txnid(env, meta) && - snap >= atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease))) { - /* workaround for todo4recovery://erased_by_github/libmdbx/issues/269 - */ - rc = meta_waittxnid(env, (const MDBX_meta *)meta, ×tamp); - mdbx_jitter4testing(false); - if (likely(rc == MDBX_SUCCESS)) - break; - if (likely(rc == MDBX_RESULT_TRUE)) - continue; + if (unlikely(env->me_stuck_meta >= 0)) + break; + if (unlikely(meta_should_retry(env, &troika) || + head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, + mo_AcquireRelease))) { + if (unlikely(++loop > 42)) { + ERROR("bailout waiting for valid snapshot (%s)", + "metapages are too volatile"); + rc = MDBX_PROBLEM; + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); goto bailout; } + timestamp = 0; + continue; } - } else { - /* r/o recovery mode */ - MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta); - txn->mt_txnid = constmeta_txnid(env, meta); - txn->mt_geo = meta->mm_geo; - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - txn->mt_canary = meta->mm_canary; - if (likely(r)) { - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, - mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64(4, meta->mm_pages_retired), - mo_Relaxed); - atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_Relaxed); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_Relaxed); + + rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v, + ×tamp); + jitter4testing(false); + if (likely(rc == MDBX_SUCCESS)) + break; + + if (unlikely(rc != MDBX_RESULT_TRUE)) { + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); + goto bailout; } } if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { - mdbx_error("%s", "environment corrupted by died writer, must shutdown!"); + ERROR("%s", "environment corrupted by died writer, must shutdown!"); + if (likely(r)) + safe64_reset(&r->mr_txnid, false); + txn->mt_txnid = INVALID_TXNID; rc = MDBX_CORRUPTED; goto bailout; } - mdbx_assert(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); + eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); txn->mt_numdbs = env->me_numdbs; } else { - mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_WRITEMAP)) == 0); if (unlikely(txn->mt_owner == tid || /* not recovery mode */ env->me_stuck_meta >= 0)) return MDBX_BUSY; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && (env->me_flags & MDBX_NOTLS) == 0 && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { @@ -11446,8 +11449,8 @@ static int 
mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Not yet touching txn == env->me_txn0, it may be active */ - mdbx_jitter4testing(false); - rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY)); + jitter4testing(false); + rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY)); if (unlikely(rc)) return rc; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { @@ -11461,24 +11464,24 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } #endif /* Windows */ - mdbx_jitter4testing(false); - const MDBX_meta *meta = constmeta_prefer_last(env); + txn->tw.troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; while ( "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { - rc = meta_waittxnid(env, (const MDBX_meta *)meta, ×tamp); + rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, + head.ptr_v, ×tamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - mdbx_jitter4testing(false); - txn->mt_canary = meta->mm_canary; - const txnid_t snap = constmeta_txnid(env, meta); - txn->mt_txnid = safe64_txnid_next(snap); + txn->mt_canary = head.ptr_c->mm_canary; + eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); + txn->mt_txnid = safe64_txnid_next(head.txnid); if (unlikely(txn->mt_txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } @@ -11499,11 +11502,11 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_numdbs = env->me_numdbs; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_geo = meta->mm_geo; + txn->mt_geo = head.ptr_c->mm_geo; - rc = mdbx_dpl_alloc(txn); + rc = dpl_alloc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; @@ -11511,7 +11514,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Setup db info */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { const unsigned db_flags = env->me_dbflags[i]; @@ -11525,7 +11528,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_warning("%s", "environment had fatal error, must shutdown!"); + WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; } else { const size_t size = @@ -11538,11 +11541,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, - (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); + rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, + (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); if (rc != MDBX_SUCCESS) goto bailout; + } else { + env->me_dxb_mmap.current = size; + env->me_dxb_mmap.filesize = + (env->me_dxb_mmap.filesize < size) ? 
size : env->me_dxb_mmap.filesize; } if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) @@ -11554,23 +11561,19 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ - } else { - env->me_dxb_mmap.current = size; - env->me_dxb_mmap.filesize = - (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, txn); + txn_valgrind(env, txn); #endif txn->mt_owner = tid; return MDBX_SUCCESS; } bailout: - mdbx_tassert(txn, rc != MDBX_SUCCESS); - mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + tASSERT(txn, rc != MDBX_SUCCESS); + txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); return rc; } @@ -11584,13 +11587,19 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { if (unlikely(txn->mt_flags & bad_bits)) return MDBX_BAD_TXN; + tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == + ((txn->mt_flags & MDBX_TXN_RDONLY) + ? txn->mt_env->me_flags & MDBX_NOTLS + : 0)); #if MDBX_TXN_CHECKOWNER - if ((txn->mt_flags & MDBX_NOTLS) == 0 && - unlikely(txn->mt_owner != mdbx_thread_self())) + STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); + if (unlikely(txn->mt_owner != osal_thread_self()) && + (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) < + (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; #endif /* MDBX_TXN_CHECKOWNER */ - if (unlikely(!txn->mt_env->me_map)) + if (bad_bits && unlikely(!txn->mt_env->me_map)) return MDBX_EPERM; return MDBX_SUCCESS; @@ -11601,7 +11610,7 @@ static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { if (unlikely(err)) return err; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) return MDBX_EACCESS; return MDBX_SUCCESS; @@ -11624,14 +11633,14 @@ int mdbx_txn_renew(MDBX_txn *txn) { return rc; } - rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + rc = txn_renew(txn, MDBX_TXN_RDONLY); if (rc == MDBX_SUCCESS) { - txn->mt_owner = mdbx_thread_self(); - mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + txn->mt_owner = osal_thread_self(); + DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; } @@ -11688,31 +11697,31 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (env->me_options.spill_parent4child_denominator) { /* Spill dirty-pages of parent to provide dirtyroom for child txn */ - rc = mdbx_txn_spill(parent, nullptr, - parent->tw.dirtylist->length / - env->me_options.spill_parent4child_denominator); + rc = txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->me_options.spill_parent4child_denominator); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + tASSERT(parent, audit_ex(parent, 0, false) == 0); flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); } else if (flags & MDBX_TXN_RDONLY) { if (env->me_txn0 && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self()) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + unlikely(env->me_txn0->mt_owner == osal_thread_self()) && + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) return MDBX_TXN_OVERLAPPING; } else { /* Reuse preallocated write txn. However, do not touch it until - * mdbx_txn_renew0() succeeds, since it currently may be active. */ + * txn_renew() succeeds, since it currently may be active. */ txn = env->me_txn0; goto renew; } size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); size += tsize = sizeof(MDBX_txn); - if (unlikely((txn = mdbx_malloc(size)) == NULL)) { - mdbx_debug("calloc: %s", "failed"); + if (unlikely((txn = osal_malloc(size)) == NULL)) { + DEBUG("calloc: %s", "failed"); return MDBX_ENOMEM; } #if MDBX_DEBUG @@ -11728,23 +11737,23 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_env = env; if (parent) { - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_geo = parent->mt_geo; - rc = mdbx_dpl_alloc(txn); + rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { const unsigned len = MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; txn->tw.reclaimed_pglist = - mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); + pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); if (unlikely(!txn->tw.reclaimed_pglist)) rc = MDBX_ENOMEM; } if (unlikely(rc != MDBX_SUCCESS)) { nested_failed: - mdbx_pnl_free(txn->tw.reclaimed_pglist); - mdbx_dpl_free(txn); - mdbx_free(txn); + pnl_free(txn->tw.reclaimed_pglist); + dpl_free(txn); + osal_free(txn); return rc; } @@ -11752,40 +11761,39 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const unsigned di = mdbx_dpl_exist(parent, lp->mp_pgno); - mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp); - mdbx_tassert(parent, lp->mp_flags == P_LOOSE); - rc = - mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + const unsigned di = dpl_exist(parent, lp->mp_pgno); + tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); + tASSERT(parent, lp->mp_flags == P_LOOSE); + rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; parent->tw.loose_pages = lp->mp_next; /* Remove from dirty list */ - mdbx_page_wash(parent, di, lp, 1); + page_wash(parent, di, lp, 1); } while (parent->tw.loose_pages); parent->tw.loose_count = 0; #if MDBX_ENABLE_REFUND parent->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } txn->tw.dirtyroom = parent->tw.dirtyroom; txn->tw.dirtylru = parent->tw.dirtylru; - mdbx_dpl_sort(parent); + dpl_sort(parent); if (parent->tw.spill_pages) - mdbx_spill_purge(parent); + spill_purge(parent); - mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= + MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); - mdbx_assert(env, mdbx_pnl_check4assert( - txn->tw.reclaimed_pglist, - (txn->mt_next_pgno /* LY: intentional assignment here, - only for assertion */ - = parent->mt_next_pgno) - - MDBX_ENABLE_REFUND)); + eASSERT(env, pnl_check_allocated( + txn->tw.reclaimed_pglist, + (txn->mt_next_pgno /* LY: intentional assignment here, + only for assertion */ + = parent->mt_next_pgno) - + MDBX_ENABLE_REFUND)); txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { @@ -11810,55 +11818,56 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); + txn->tw.troika = parent->tw.troika; /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); env->me_txn = txn; - rc = mdbx_cursor_shadow(parent, txn); - if (mdbx_audit_enabled() && mdbx_assert_enabled()) { + rc = cursor_shadow(parent, txn); + if (AUDIT_ENABLED() && ASSERT_ENABLED()) { txn->mt_signature = MDBX_MT_SIGNATURE; - mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); } if (unlikely(rc != MDBX_SUCCESS)) - mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + txn_end(txn, MDBX_END_FAIL_BEGINCHILD); } else { /* MDBX_TXN_RDONLY */ txn->mt_dbiseqs = env->me_dbiseqs; renew: - rc = mdbx_txn_renew0(txn, flags); + rc = txn_renew(txn, flags); } if (unlikely(rc != MDBX_SUCCESS)) { if (txn != env->me_txn0) - mdbx_free(txn); + osal_free(txn); } else { if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) - mdbx_assert(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); else if (flags & MDBX_TXN_RDONLY) - mdbx_assert(env, (txn->mt_flags & - ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | - /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); + eASSERT(env, (txn->mt_flags & + ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); else { - mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | - MDBX_TXN_SPILLS)) == 0); + eASSERT(env, (txn->mt_flags & + ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; *ret = txn; - mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; @@ -11874,7 +11883,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_env *const env = txn->mt_env; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -11884,24 +11893,21 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); if (txn->mt_flags & MDBX_TXN_RDONLY) { - volatile const MDBX_meta *head_meta; - txnid_t head_txnid; + meta_ptr_t head; uint64_t head_retired; + meta_troika_t troika = meta_tap(env); do { /* fetch info from volatile head */ - head_meta = meta_prefer_last(env); - head_txnid = meta_txnid(env, head_meta); + head = meta_recent(env, &troika); head_retired = - unaligned_peek_u64_volatile(4, head_meta->mm_pages_retired); - info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now); - info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); info->txn_space_leftover = - pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - mdbx_compiler_barrier(); - } while (unlikely(head_meta != meta_prefer_last(env) || - head_txnid != meta_txnid(env, head_meta))); + pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); + } while (unlikely(meta_should_retry(env, &troika))); - info->txn_reader_lag = head_txnid - info->txn_id; + info->txn_reader_lag = head.txnid - info->txn_id; info->txn_space_dirty = info->txn_space_retired = 0; uint64_t reader_snapshot_pages_retired; if (txn->to.reader && @@ -11915,13 +11921,13 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && info->txn_reader_lag > 1 && lck) { /* find next more recent reader */ - txnid_t next_reader = head_txnid; + txnid_t next_reader = head.txnid; const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); const uint64_t snap_retired = @@ -11966,7 +11972,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { - oldest_snapshot = mdbx_find_oldest(txn); + oldest_snapshot = txn_oldest_reader(txn); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; @@ -12010,11 +12016,19 @@ int mdbx_txn_flags(const MDBX_txn *txn) { } /* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) +static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) { + if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) + return false; + if (likely( + txn->mt_dbiseqs[dbi].weak == + atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], + mo_AcquireRelease))) + return false; + return true; +} static __inline unsigned dbi_seq(const 
MDBX_env *const env, unsigned slot) { - unsigned v = env->me_dbiseqs[slot] + 1; + unsigned v = env->me_dbiseqs[slot].weak + 1; return v + (v == 0); } @@ -12025,22 +12039,22 @@ static void dbi_import_locked(MDBX_txn *txn) { if (i >= txn->mt_numdbs) { txn->mt_cursors[i] = NULL; if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[i] = 0; + txn->mt_dbiseqs[i].weak = 0; txn->mt_dbistate[i] = 0; } - if ((TXN_DBI_CHANGED(txn, i) && + if ((dbi_changed(txn, i) && (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || ((env->me_dbflags[i] & DB_VALID) && !(txn->mt_dbistate[i] & DBI_VALID))) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & - (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); + tASSERT(txn, + (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; txn->mt_dbistate[i] = 0; if (env->me_dbflags[i] & DB_VALID) { txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); - mdbx_tassert(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); } } } @@ -12051,7 +12065,7 @@ static void dbi_import_locked(MDBX_txn *txn) { else { if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[n] = 0; + txn->mt_dbiseqs[n].weak = 0; txn->mt_dbistate[n] = 0; } ++n; @@ -12065,17 +12079,17 @@ __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) return false; - mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); dbi_import_locked(txn); - mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); return txn->mt_dbistate[dbi] & DBI_USRVALID; } /* Export or close DBI handles opened in this txn. 
*/ static void dbi_update(MDBX_txn *txn, int keep) { - mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); + tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; @@ -12085,11 +12099,11 @@ static void dbi_update(MDBX_txn *txn, int keep) { if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) continue; if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } - if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) + if (env->me_numdbs <= i || + txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) continue /* dbi explicitly closed and/or then re-opened by other txn */; if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; @@ -12097,11 +12111,11 @@ static void dbi_update(MDBX_txn *txn, int keep) { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { env->me_dbxs[i].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); - mdbx_assert(env, env->me_dbflags[i] == 0); - env->me_dbiseqs[i] = dbi_seq(env, i); + eASSERT(env, env->me_dbflags[i] == 0); + atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), + mo_AcquireRelease); env->me_dbxs[i].md_name.iov_base = NULL; - mdbx_free(ptr); + osal_free(ptr); } } } @@ -12109,8 +12123,7 @@ static void dbi_update(MDBX_txn *txn, int keep) { n = env->me_numdbs; if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } @@ -12121,30 +12134,27 @@ static void dbi_update(MDBX_txn *txn, int keep) { } if (unlikely(locked)) - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } } /* Filter-out pgno list from transaction's dirty-page list */ -static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, - const bool spilled) { +static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(pl, (size_t)txn->mt_next_pgno << spilled)); - MDBX_dpl *dl = mdbx_dpl_sort(txn); + tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); + MDBX_dpl *dl = dpl_sort(txn); /* Scanning in ascend order */ const int step = MDBX_PNL_ASCENDING ? 1 : -1; const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); const int end = MDBX_PNL_ASCENDING ? 
MDBX_PNL_SIZE(pl) + 1 : 0; - mdbx_tassert(txn, pl[begin] <= pl[end - step]); + tASSERT(txn, pl[begin] <= pl[end - step]); - unsigned r = mdbx_dpl_search(txn, pl[begin] >> spilled); - mdbx_tassert(txn, dl->sorted == dl->length); + unsigned r = dpl_search(txn, pl[begin] >> spilled); + tASSERT(txn, dl->sorted == dl->length); for (int i = begin; r <= dl->length;) { /* scan loop */ assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pgno_t pl_pgno = pl[i] >> spilled; pgno_t dp_pgno = dl->items[r].pgno; if (likely(dp_pgno != pl_pgno)) { @@ -12157,12 +12167,12 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } /* update loop */ - unsigned w = r; + unsigned npages, w = r; remove_dl: - if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { - MDBX_page *dp = dl->items[r].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); - } + npages = dpl_npages(dl, r); + dl->pages_including_loose -= npages; + if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) + dpage_free(txn->mt_env, dl->items[r].ptr, npages); ++r; next_i: i += step; @@ -12172,7 +12182,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } else { while (r <= dl->length) { assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pl_pgno = pl[i] >> spilled; dp_pgno = dl->items[r].pgno; if (dp_pgno < pl_pgno) @@ -12185,8 +12195,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } dl->sorted = dpl_setlen(dl, w - 1); txn->tw.dirtyroom += r - w; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); return; @@ -12198,50 +12207,49 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, * May be called twice for readonly txns: First reset it, then abort. * [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { +static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; static const char *const names[] = MDBX_END_NAMES; #if MDBX_ENV_CHECKPID - if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { + if (unlikely(txn->mt_env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } #endif /* MDBX_ENV_CHECKPID */ - mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - names[mode & MDBX_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + names[mode & MDBX_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ - mdbx_cursors_eot(txn, false); + cursors_eot(txn, false); int rc = MDBX_SUCCESS; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { + if (txn->mt_flags & MDBX_TXN_RDONLY) { if (txn->to.reader) { MDBX_reader *slot = txn->to.reader; - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); - if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { - mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak && - slot->mr_txnid.weak >= - env->me_lck->mti_oldest_reader.weak); + eASSERT(env, slot->mr_pid.weak == env->me_pid); + if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { + eASSERT(env, + txn->mt_txnid == slot->mr_txnid.weak && + slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } else { - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); - mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, slot->mr_pid.weak == env->me_pid); + eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -12251,53 +12259,54 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } #if defined(_WIN32) || defined(_WIN64) if (txn->mt_flags & MDBX_SHRINK_ALLOWED) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #endif txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; - } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { + } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; - mdbx_pnl_free(txn->tw.spill_pages); + pnl_free(txn->tw.spill_pages); txn->tw.spill_pages = nullptr; if (txn == env->me_txn0) { - mdbx_assert(env, txn->mt_parent == NULL); + eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ dbi_update(txn, mode & MDBX_END_UPDATE); - mdbx_pnl_shrink(&txn->tw.retired_pages); - mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); + pnl_shrink(&txn->tw.retired_pages); + pnl_shrink(&txn->tw.reclaimed_pglist); if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); + dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. 
*/ mdbx_txn_unlock(env); } else { - mdbx_assert(env, txn->mt_parent != NULL); + eASSERT(env, txn->mt_parent != NULL); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert( - env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, + sizeof(meta_troika_t)) == 0); if (txn->tw.lifo_reclaimed) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); + eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; } if (txn->tw.retired_pages) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= - (unsigned)(uintptr_t)parent->tw.retired_pages); + eASSERT(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= + (unsigned)(uintptr_t)parent->tw.retired_pages); MDBX_PNL_SIZE(txn->tw.retired_pages) = (unsigned)(uintptr_t)parent->tw.retired_pages; parent->tw.retired_pages = txn->tw.retired_pages; @@ -12306,33 +12315,33 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; parent->tw.dirtylru = txn->tw.dirtylru; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + tASSERT(parent, dirtylist_check(parent)); + tASSERT(parent, audit_ex(parent, 0, false) == 0); if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); - mdbx_dpl_free(txn); - mdbx_pnl_free(txn->tw.reclaimed_pglist); + dlist_free(txn); + dpl_free(txn); + pnl_free(txn->tw.reclaimed_pglist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { /* undo resize performed by child txn */ - rc = mdbx_mapresize_implicit(env, parent->mt_next_pgno, - parent->mt_geo.now, parent->mt_geo.upper); + rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper); if (rc == MDBX_EPERM) { /* unable undo resize (it is regular for Windows), * therefore promote size changes from child to the parent txn */ - mdbx_warning("unable undo resize performed by child txn, promote to " - "the parent (%u->%u, %u->%u)", - txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, - parent->mt_geo.upper); + WARNING("unable undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, + parent->mt_geo.upper); parent->mt_geo.now = txn->mt_geo.now; parent->mt_geo.upper = txn->mt_geo.upper; parent->mt_flags |= MDBX_TXN_DIRTY; rc = MDBX_SUCCESS; } else if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_error("error %d while undo resize performed by child txn, fail " - "the parent", - rc); + ERROR("error %d while undo resize performed by child txn, fail " + "the parent", + rc); parent->mt_flags |= MDBX_TXN_ERROR; if (!env->me_dxb_mmap.address) env->me_flags |= MDBX_FATAL_ERROR; @@ -12341,10 +12350,10 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } 
} - mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0); + eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { txn->mt_signature = 0; - mdbx_free(txn); + osal_free(txn); } return rc; @@ -12360,10 +12369,10 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EINVAL; /* LY: don't close DBI-handles */ - rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); if (rc == MDBX_SUCCESS) { - mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); - mdbx_tassert(txn, txn->mt_owner == 0); + tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); + tASSERT(txn, txn->mt_owner == 0); } return rc; } @@ -12386,22 +12395,25 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) + if (txn->mt_flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | - MDBX_END_FREE); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | + MDBX_END_FREE); + + if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) + return MDBX_BAD_TXN; if (txn->mt_child) mdbx_txn_abort(txn->mt_child); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); + tASSERT(txn, dirtylist_check(txn)); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. */ -__cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, - bool dont_filter_gc) { +__cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc) { pgno_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + @@ -12409,7 +12421,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, } MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI); + int rc = cursor_init(&cx.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12431,7 +12443,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, gc += *(pgno_t *)data.iov_base; skip:; } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] &= ~DBI_AUDITED; @@ -12440,7 +12452,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { if (!(txn->mt_dbistate[i] & DBI_VALID)) continue; - rc = mdbx_cursor_init(&cx.outer, txn, i); + rc = cursor_init(&cx.outer, txn, i); if (unlikely(rc != MDBX_SUCCESS)) return rc; txn->mt_dbistate[i] |= DBI_AUDITED; @@ -12451,7 +12463,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, if (i != MAIN_DBI) continue; - rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (rc == MDBX_SUCCESS) { MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; for (unsigned j = 0; j < page_numkeys(mp); j++) { @@ -12479,9 +12491,9 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; } } - rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); 
+ tASSERT(txn, rc == MDBX_NOTFOUND); } for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { @@ -12496,12 +12508,12 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, break; } if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { - mdbx_warning("audit %s@%" PRIaTXN - ": unable account dbi %d / \"%*s\", state 0x%02x", - txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, - (int)txn->mt_dbxs[i].md_name.iov_len, - (const char *)txn->mt_dbxs[i].md_name.iov_base, - txn->mt_dbistate[i]); + WARNING("audit %s@%" PRIaTXN + ": unable account dbi %d / \"%*s\", state 0x%02x", + txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, + (int)txn->mt_dbxs[i].md_name.iov_len, + (const char *)txn->mt_dbxs[i].md_name.iov_base, + txn->mt_dbistate[i]); } } @@ -12509,91 +12521,137 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_SUCCESS; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " - "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", - txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, - retired_stored); - mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO - "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO - "(allocated)", - txn->mt_txnid, pending, gc, used, pending + gc + used, - txn->mt_next_pgno); + ERROR("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " + "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", + txn->mt_txnid, pending, txn->tw.loose_count, + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, + retired_stored); + ERROR("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO + "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO + "(allocated)", + txn->mt_txnid, pending, gc, used, pending + gc + used, + txn->mt_next_pgno); return MDBX_PROBLEM; } -static __always_inline unsigned backlog_size(MDBX_txn *txn) { +typedef struct gc_update_context { + unsigned retired_stored, loop; + unsigned settled, cleaned_slot, reused_slot, filled_slot; + txnid_t cleaned_id, rid; + bool lifo, dense; +#if MDBX_ENABLE_BIGFOOT + txnid_t bigfoot; +#endif /* MDBX_ENABLE_BIGFOOT */ + MDBX_cursor_couple cursor; +} gcu_context_t; + +static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { + memset(ctx, 0, offsetof(gcu_context_t, cursor)); + ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; +#if MDBX_ENABLE_BIGFOOT + ctx->bigfoot = txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); +} + +static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; } -/* LY: Prepare a backlog of pages to modify GC itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. 
*/ -static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, - const size_t pnl_bytes, unsigned *retired_stored) { - const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); +static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { + int err = MDBX_SUCCESS; + if (ctx->retired_stored) + do { + MDBX_val key, val; +#if MDBX_ENABLE_BIGFOOT + key.iov_base = &ctx->bigfoot; +#else + key.iov_base = &txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + key.iov_len = sizeof(txnid_t); + const struct cursor_set_result csr = + cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + if (csr.err == MDBX_SUCCESS && csr.exact) { + ctx->retired_stored = 0; + err = mdbx_cursor_del(&ctx->cursor.outer, 0); + TRACE("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn), + err); + } + } +#if MDBX_ENABLE_BIGFOOT + while (!err && --ctx->bigfoot >= txn->mt_txnid); +#else + while (0); +#endif /* MDBX_ENABLE_BIGFOOT */ + return err; +} + +/* Prepare a backlog of pages to modify GC itself, while reclaiming is + * prohibited. It should be enough to prevent search in page_alloc_slowpath() + * during a deleting, when GC tree is unbalanced. */ +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, + const bool reserve4retired) { + const unsigned pages4retiredlist = + reserve4retired ? number_of_ovpages( + txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) + : 0; const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; const unsigned backlog4rebalance = backlog4cow + 1; - if (likely(linear4list == 1 && - backlog_size(txn) > (pnl_bytes - ? backlog4rebalance - : (backlog4cow + backlog4rebalance)))) + if (likely(pages4retiredlist < 2 && + gcu_backlog_size(txn) > (reserve4retired + ? backlog4rebalance + : (backlog4cow + backlog4rebalance)))) return MDBX_SUCCESS; - mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", - pnl_bytes, backlog_size(txn), linear4list, backlog4cow, - backlog4rebalance); + TRACE(">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", + reserve4retired ? 
'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, + backlog4cow, backlog4rebalance); - MDBX_val gc_key, fake_val; int err; - if (unlikely(linear4list > 2)) { - gc_key.iov_base = fake_val.iov_base = nullptr; - gc_key.iov_len = sizeof(txnid_t); - fake_val.iov_len = pnl_bytes; - err = mdbx_cursor_spill(gc_cursor, &gc_key, &fake_val); + if (unlikely(pages4retiredlist > 2)) { + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); + err = cursor_spill(&ctx->cursor.outer, &key, &val); if (unlikely(err != MDBX_SUCCESS)) return err; } - gc_cursor->mc_flags &= ~C_RECLAIMING; - err = mdbx_cursor_touch(gc_cursor); - mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + err = cursor_touch(&ctx->cursor.outer); + TRACE("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); - if (unlikely(linear4list > 1) && err == MDBX_SUCCESS) { - if (retired_stored) { - gc_key.iov_base = &txn->mt_txnid; - gc_key.iov_len = sizeof(txn->mt_txnid); - const struct cursor_set_result csr = - mdbx_cursor_set(gc_cursor, &gc_key, &fake_val, MDBX_SET); - if (csr.err == MDBX_SUCCESS && csr.exact) { - *retired_stored = 0; - err = mdbx_cursor_del(gc_cursor, 0); - mdbx_trace("== clear-4linear, backlog %u, err %d", backlog_size(txn), - err); - } - } - err = - mdbx_page_alloc(gc_cursor, linear4list, MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) - .err; - mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); - mdbx_cassert(gc_cursor, - backlog_size(txn) >= linear4list || err != MDBX_SUCCESS); + if (unlikely(pages4retiredlist > 1) && + MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored && + err == MDBX_SUCCESS) { + tASSERT(txn, reserve4retired); + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist, + MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) + .err; + TRACE("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err); + cASSERT(&ctx->cursor.outer, + gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); } - while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) - err = mdbx_page_alloc(gc_cursor, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE | - MDBX_ALLOC_NOLOG) + while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && + err == MDBX_SUCCESS) + err = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG) .err; - gc_cursor->mc_flags |= C_RECLAIMING; - mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); + ctx->cursor.outer.mc_flags |= C_RECLAIMING; + TRACE("<< backlog %u, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } -static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { +static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) @@ -12612,164 +12670,146 @@ static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { * "checks and balances") to partially bypass the fundamental design problems * inherited from LMDB. So do not try to understand it completely in order to * avoid your madness. 
*/ -static int mdbx_update_gc(MDBX_txn *txn) { +static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { + TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid); + MDBX_env *const env = txn->mt_env; + const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; + (void)dbg_prefix_mode; + ctx->cursor.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; + /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * Page numbers cannot disappear from txn->tw.retired_pages[]. */ - MDBX_env *const env = txn->mt_env; - const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; - (void)dbg_prefix_mode; - mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); - - unsigned retired_stored = 0, loop = 0; - MDBX_cursor_couple couple; - int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout_notracking; - - couple.outer.mc_flags |= C_RECLAIMING; - couple.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &couple.outer; - bool dense_gc = false; retry: - ++loop; - mdbx_trace("%s", " >> restart"); - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { - mdbx_error("too more loops %u, bailout", loop); + ++ctx->loop; + TRACE("%s", " >> restart"); + int rc = MDBX_SUCCESS; + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); + if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { + ERROR("too more loops %u, bailout", ctx->loop); rc = MDBX_PROBLEM; goto bailout; } - if (unlikely(dense_gc) && retired_stored) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (unlikely(ctx->dense)) { + rc = gcu_clean_stored_retired(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, - filled_gc_slot = ~0u; - txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; + ctx->cleaned_id = 0; + ctx->rid = txn->tw.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ MDBX_val key, data; - mdbx_trace("%s", " >> continue"); + TRACE("%s", " >> continue"); - if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && - MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && + (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || + ctx->retired_stored > env->me_maxgc_ov1page)) { + rc = gcu_prepare_backlog(txn, ctx, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (lifo) { - if (cleaned_gc_slot < (txn->tw.lifo_reclaimed - ? 
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)) { - settled = 0; - cleaned_gc_slot = 0; - reused_gc_slot = 0; - filled_gc_slot = ~0u; + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (ctx->lifo) { + if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)) { + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; /* LY: cleanup reclaimed records. */ do { - cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; - mdbx_tassert(txn, - cleaned_gc_slot > 0 && - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - key.iov_base = &cleaned_gc_id; - key.iov_len = sizeof(cleaned_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); + ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; + tASSERT(txn, + ctx->cleaned_slot > 0 && + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + key.iov_base = &ctx->cleaned_id; + key.iov_len = sizeof(ctx->cleaned_id); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, - cleaned_gc_slot, cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_slot, ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); + rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); - mdbx_txl_sort(txn->tw.lifo_reclaimed); + } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + txl_sort(txn->tw.lifo_reclaimed); } } else { /* If using records from GC which we have not yet deleted, * now delete them and any we reserved for tw.reclaimed_pglist. 
*/ - while (cleaned_gc_id <= txn->tw.last_reclaimed) { - rc = mdbx_cursor_first(&couple.outer, &key, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND) - break; + while (ctx->cleaned_id <= txn->tw.last_reclaimed) { + rc = cursor_first(&ctx->cursor.outer, &key, NULL); + if (rc == MDBX_NOTFOUND) + break; + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - gc_rid = cleaned_gc_id; - settled = 0; - reused_gc_slot = 0; - cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } - if (cleaned_gc_id > txn->tw.last_reclaimed) + ctx->rid = ctx->cleaned_id; + ctx->settled = 0; + ctx->reused_slot = 0; + ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); + if (ctx->cleaned_id > txn->tw.last_reclaimed) break; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); - mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, - cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); + rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } /* return suitable into unallocated space */ - if (mdbx_refund(txn)) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + if (txn_refund(txn)) { + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -12782,20 +12822,31 @@ retry: * The pages themselves remain in dirtylist. 
*/ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { + TRACE("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode, + txn->tw.loose_count); + rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE) + .err; + if (rc == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %u loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; + } + /* Put loose page numbers in tw.retired_pages, * since unable to return them to tw.reclaimed_pglist. */ - if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) goto bailout; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - mdbx_trace("%s: append %u loose-pages to retired-pages", - dbg_prefix_mode, txn->tw.loose_count); + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + TRACE("%s: append %u loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } } else { /* Room for loose pages + temp PNL with same */ - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, - 2 * txn->tw.loose_count + 2); + rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; MDBX_PNL loose = txn->tw.reclaimed_pglist + @@ -12803,15 +12854,15 @@ retry: txn->tw.loose_count - 1; unsigned count = 0; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { - mdbx_tassert(txn, mp->mp_flags == P_LOOSE); + tASSERT(txn, mp->mp_flags == P_LOOSE); loose[++count] = mp->mp_pgno; } - mdbx_tassert(txn, count == txn->tw.loose_count); + tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SIZE(loose) = count; - mdbx_pnl_sort(loose, txn->mt_next_pgno); - mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose); - mdbx_trace("%s: append %u loose-pages to reclaimed-pages", - dbg_prefix_mode, txn->tw.loose_count); + pnl_sort(loose, txn->mt_next_pgno); + pnl_merge(txn->tw.reclaimed_pglist, loose); + TRACE("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode, + txn->tw.loose_count); } /* filter-out list of dirty-pages from loose-pages */ @@ -12819,25 +12870,25 @@ retry: unsigned w = 0; for (unsigned r = w; ++r <= dl->length;) { MDBX_page *dp = dl->items[r].ptr; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); if ((dp->mp_flags & P_LOOSE) == 0) { if (++w != r) dl->items[w] = dl->items[r]; } else { - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + tASSERT(txn, dp->mp_flags == P_LOOSE); if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, 1); + dpage_free(env, dp, 1); } } - mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", - dbg_prefix_mode, dl->length, w); - mdbx_tassert(txn, txn->tw.loose_count == dl->length - w); + TRACE("%s: filtered-out loose-pages from %u -> %u dirty-pages", + dbg_prefix_mode, dl->length, w); + tASSERT(txn, txn->tw.loose_count == dl->length - w); dpl_setlen(dl, w); dl->sorted = 0; + dl->pages_including_loose -= txn->tw.loose_count; txn->tw.dirtyroom += txn->tw.loose_count; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); txn->tw.loose_pages = NULL; @@ -12849,50 +12900,106 @@ retry: const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); /* handle retired-list - store ones into single gc-record */ - if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { - if (unlikely(!retired_stored)) { + if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { + if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - couple.outer.mc_flags &= ~C_RECLAIMING; - rc = mdbx_page_search(&couple.outer, NULL, - MDBX_PS_LAST | MDBX_PS_MODIFY); - couple.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + rc = page_search(&ctx->cursor.outer, NULL, + MDBX_PS_LAST | MDBX_PS_MODIFY); + ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } + +#if MDBX_ENABLE_BIGFOOT + unsigned retired_pages_before; + do { + if (ctx->bigfoot > txn->mt_txnid) { + rc = gcu_clean_stored_retired(txn, ctx); + tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); + } + + retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages); + rc = gcu_prepare_backlog(txn, ctx, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + ctx->retired_stored = 0; + ctx->bigfoot = txn->mt_txnid; + do { + key.iov_len = sizeof(txnid_t); + key.iov_base = &ctx->bigfoot; + const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) - + ctx->retired_stored; + const unsigned chunk = + (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) + ? env->me_maxgc_ov1page + : left; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) { + const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING) + ? left - chunk + : ctx->retired_stored; + pgno_t *const begin = txn->tw.retired_pages + at; + /* MDBX_PNL_ASCENDING == false && LIFO == false: + * - the larger pgno is at the beginning of retired list + * and should be placed with the larger txnid. + * MDBX_PNL_ASCENDING == true && LIFO == true: + * - the larger pgno is at the ending of retired list + * and should be placed with the smaller txnid. 
+ */ + const pgno_t save = *begin; + *begin = chunk; + memcpy(data.iov_base, begin, data.iov_len); + *begin = save; + TRACE("%s: put-retired/bigfoot @ %" PRIaTXN + " (slice #%u) #%u [%u..%u] of %u", + dbg_prefix_mode, ctx->bigfoot, + (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, + at + chunk, retired_pages_before); + } + ctx->retired_stored += chunk; + } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) && + (++ctx->bigfoot, true)); + } while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages)); +#else /* Write to last page of GC */ - key.iov_len = sizeof(txn->mt_txnid); + key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { + gcu_prepare_backlog(txn, ctx, true); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, &retired_stored); - rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); - mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); - mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, - retired_stored, txn->mt_txnid); - - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - unsigned i = retired_stored; - mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + ctx->retired_stored, txn->mt_txnid); +#endif /* MDBX_ENABLE_BIGFOOT */ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + unsigned i = ctx->retired_stored; + DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %u, retired-PNL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); - mdbx_debug_extra_print("%s\n", "."); + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); } if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && - settled)) { - mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", - dbg_prefix_mode, amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + ctx->settled)) { + TRACE("%s: reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, + amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. 
*/ ; @@ -12901,33 +13008,32 @@ retry: } /* handle reclaimed and lost pages - merge and store both into gc */ - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, txn->tw.loose_count == 0); - mdbx_trace("%s", " >> reserving"); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + TRACE("%s", " >> reserving"); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const unsigned left = amount - settled; - mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " - "reused-gc-slots %u", - dbg_prefix_mode, amount, settled, (int)left, - txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0, - reused_gc_slot); + const unsigned left = amount - ctx->settled; + TRACE("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " + "reused-gc-slots %u", + dbg_prefix_mode, amount, ctx->settled, (int)left, + txn->tw.lifo_reclaimed + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0, + ctx->reused_slot); if (0 >= (int)left) break; const unsigned prefer_max_scatter = 257; txnid_t reservation_gc_id; - if (lifo) { + if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { rc = MDBX_ENOMEM; goto bailout; @@ -12936,35 +13042,35 @@ retry: if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page && - !dense_gc) { + !ctx->dense) { /* LY: need just a txn-id for save page list. */ bool need_cleanup = false; txnid_t snap_oldest; retry_rid: - couple.outer.mc_flags &= ~C_RECLAIMING; + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; do { - snap_oldest = mdbx_find_oldest(txn); - rc = - mdbx_page_alloc(&couple.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) - .err; + snap_oldest = txn_oldest_reader(txn); + rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE) + .err; if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, - MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); + TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, + MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); need_cleanup = true; } } while (rc == MDBX_SUCCESS && (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page); - couple.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); + TRACE("%s: got enough from GC.", dbg_prefix_mode); continue; } else if (unlikely(rc != MDBX_NOTFOUND)) /* LY: some troubles... 
*/ @@ -12972,54 +13078,53 @@ retry: if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { if (need_cleanup) { - mdbx_txl_sort(txn->tw.lifo_reclaimed); - cleaned_gc_slot = 0; + txl_sort(txn->tw.lifo_reclaimed); + ctx->cleaned_slot = 0; } - gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); + ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { - mdbx_tassert(txn, txn->tw.last_reclaimed == 0); - if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) - /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + tASSERT(txn, txn->tw.last_reclaimed == 0); + if (unlikely(txn_oldest_reader(txn) != snap_oldest)) + /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; - mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, - dbg_prefix_mode, gc_rid); + txn->tw.last_reclaimed = ctx->rid = snap_oldest; + TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, + ctx->rid); } /* LY: GC is empty, will look any free txn-id in high2low order. */ while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page) { - if (unlikely(gc_rid <= MIN_TXNID)) { + if (unlikely(ctx->rid <= MIN_TXNID)) { if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= - reused_gc_slot)) { - mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " - "lifo_reclaimed %u" PRIaTXN, - reused_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + ctx->reused_slot)) { + NOTICE("** restart: reserve depleted (reused_gc_slot %u >= " + "lifo_reclaimed %u" PRIaTXN, + ctx->reused_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } break; } - mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); - --gc_rid; - key.iov_base = &gc_rid; - key.iov_len = sizeof(gc_rid); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); + --ctx->rid; + key.iov_base = &ctx->rid; + key.iov_len = sizeof(ctx->rid); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc == MDBX_SUCCESS)) { - mdbx_debug("%s: GC's id %" PRIaTXN - " is used, continue bottom-up search", - dbg_prefix_mode, gc_rid); - ++gc_rid; - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); + DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", + dbg_prefix_mode, ctx->rid); + ++ctx->rid; + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); if (rc == MDBX_NOTFOUND) { - mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode); - dense_gc = true; + DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode); + ctx->dense = true; break; } if (unlikely(rc != MDBX_SUCCESS || @@ -13028,98 +13133,86 @@ retry: goto bailout; } txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } if (gc_first <= MIN_TXNID) { - mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN - " (going dense-mode)", - dbg_prefix_mode, gc_rid); - dense_gc = true; + DEBUG("%s: no free GC's id(s) less than %" PRIaTXN + " (going dense-mode)", + dbg_prefix_mode, ctx->rid); + ctx->dense = true; break; } - gc_rid = gc_first - 1; + ctx->rid = gc_first - 1; } - mdbx_assert(env, 
!dense_gc); - rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); + eASSERT(env, !ctx->dense); + rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (reused_gc_slot) + if (ctx->reused_slot) /* rare case, but it is better to clear and re-create GC entries * with less fragmentation. */ need_cleanup = true; else - cleaned_gc_slot += + ctx->cleaned_slot += 1 /* mark cleanup is not needed for added slot. */; - mdbx_trace("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, gc_rid, cleaned_gc_slot); + TRACE("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); } - if (need_cleanup || dense_gc) { - if (cleaned_gc_slot) - mdbx_trace( - "%s: restart inner-loop to clear and re-create GC entries", - dbg_prefix_mode); - cleaned_gc_slot = 0; + if (need_cleanup || ctx->dense) { + if (ctx->cleaned_slot) + TRACE("%s: restart inner-loop to clear and re-create GC entries", + dbg_prefix_mode); + ctx->cleaned_slot = 0; continue; } } const unsigned i = - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; - mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; + tASSERT(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); reservation_gc_id = txn->tw.lifo_reclaimed[i]; - mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", - dbg_prefix_mode, reservation_gc_id, i); + TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, + reservation_gc_id, i); } else { - mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); - if (unlikely(gc_rid == 0)) { - gc_rid = mdbx_find_oldest(txn) - 1; - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST); + tASSERT(txn, txn->tw.lifo_reclaimed == NULL); + if (unlikely(ctx->rid == 0)) { + ctx->rid = txn_oldest_reader(txn); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); if (rc == MDBX_SUCCESS) { - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } - if (gc_rid >= gc_first) - gc_rid = gc_first - 1; - if (unlikely(gc_rid == 0)) { - mdbx_error("%s", "** no GC tail-space to store (going dense-mode)"); - dense_gc = true; + if (ctx->rid >= gc_first) + ctx->rid = gc_first - 1; + if (unlikely(ctx->rid == 0)) { + ERROR("%s", "** no GC tail-space to store (going dense-mode)"); + ctx->dense = true; goto retry; } } else if (rc != MDBX_NOTFOUND) goto bailout; - txn->tw.last_reclaimed = gc_rid; - cleaned_gc_id = gc_rid + 1; + txn->tw.last_reclaimed = ctx->rid; + ctx->cleaned_id = ctx->rid + 1; } - reservation_gc_id = gc_rid--; - mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, - reservation_gc_id); + reservation_gc_id = ctx->rid--; + TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); } - ++reused_gc_slot; + ++ctx->reused_slot; unsigned chunk = left; if (unlikely(chunk > env->me_maxgc_ov1page)) { const unsigned avail_gc_slots = txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot + 1 - : (gc_rid < INT16_MAX) ? (unsigned)gc_rid - : INT16_MAX; + ctx->reused_slot + 1 + : (ctx->rid < INT16_MAX) ? 
(unsigned)ctx->rid + : INT16_MAX; if (avail_gc_slots > 1) { if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; @@ -13153,29 +13246,29 @@ retry: chunk = (avail >= tail) ? tail - span : (avail_gc_slots > 3 && - reused_gc_slot < prefer_max_scatter - 3) + ctx->reused_slot < prefer_max_scatter - 3) ? avail - span : tail; } } } } - mdbx_tassert(txn, chunk > 0); + tASSERT(txn, chunk > 0); - mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " - "%" PRIaTXN, - dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); + TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); - mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, - env->me_maxgc_ov1page); + TRACE("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, + env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); if (unlikely( reservation_gc_id < MIN_TXNID || - reservation_gc_id >= + reservation_gc_id > atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { - mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", - reservation_gc_id); + ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); rc = MDBX_PROBLEM; goto bailout; } @@ -13183,50 +13276,47 @@ retry: key.iov_len = sizeof(reservation_gc_id); key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); - mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, - settled + 1, settled + chunk + 1, reservation_gc_id); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, nullptr); - rc = mdbx_cursor_put(&couple.outer, &key, &data, + TRACE("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, + ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + gcu_prepare_backlog(txn, ctx, true); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); - settled += chunk; - mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, settled, - chunk); + gcu_clean_reserved(env, data); + ctx->settled += chunk; + TRACE("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, + chunk); if (txn->tw.lifo_reclaimed && unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && - (loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > - env->me_maxgc_ov1page)) { - mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > + env->me_maxgc_ov1page)) { + NOTICE("** restart: reclaimed-list growth %u -> %u", amount, + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry; } continue; } - mdbx_tassert( - txn, - cleaned_gc_slot == - (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); + tASSERT(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)); - mdbx_trace("%s", " >> filling"); + TRACE("%s", " >> filling"); /* Fill in the reserved records */ - filled_gc_slot = + ctx->filled_slot = txn->tw.lifo_reclaimed - ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot - : reused_gc_slot; + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + : ctx->reused_slot; rc = MDBX_SUCCESS; - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ @@ -13235,95 +13325,97 @@ retry: const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); unsigned left = amount; if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_first(&couple.outer, &key, &data); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_first(&ctx->cursor.outer, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, lifo != 0); + tASSERT(txn, ctx->lifo != 0); } while (true) { txnid_t fill_gc_id; - mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: left %u of %u", dbg_prefix_mode, left, + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); + tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { - mdbx_notice( + if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { + NOTICE( "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, - filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); + ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { - mdbx_tassert(txn, lifo != 0); - if (++filled_gc_slot > + tASSERT(txn, ctx->lifo != 0); + if (++ctx->filled_slot > (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { - mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " - "lifo_reclaimed %u" PRIaTXN, - filled_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + NOTICE("** restart: reserve depleted (filled_gc_slot %u > " + "lifo_reclaimed %u" PRIaTXN, + ctx->filled_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } - fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; - mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", - dbg_prefix_mode, fill_gc_id, filled_gc_slot); + fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; + TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_slot == - (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)); - mdbx_tassert(txn, fill_gc_id > 0 && - fill_gc_id < env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, + ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? 
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)); + tASSERT(txn, fill_gc_id > 0 && + fill_gc_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - couple.outer.mc_flags |= C_GCFREEZE; + tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); + ctx->cursor.outer.mc_flags |= C_GCFREEZE; unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; if (unlikely(chunk > left)) { - mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, - left, fill_gc_id); - if ((loop < 5 && chunk - left > loop / 2) || + TRACE("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, + left, fill_gc_id); + if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (loop < 7) - couple.outer.mc_flags &= ~C_GCFREEZE; + if (ctx->loop < 7) + ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; } chunk = left; } - rc = mdbx_cursor_put(&couple.outer, &key, &data, + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - couple.outer.mc_flags &= ~C_GCFREEZE; + ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); + gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { - mdbx_notice("** restart: reclaimed-list growth (%u -> %u, loose +%u)", - amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.loose_count); + NOTICE("** restart: reclaimed-list growth (%u -> %u, loose +%u)", + amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed - ? cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : cleaned_gc_id < txn->tw.last_reclaimed)) { - mdbx_notice("%s", "** restart: reclaimed-slots changed"); + ? 
ctx->cleaned_slot < + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : ctx->cleaned_id < txn->tw.last_reclaimed)) { + NOTICE("%s", "** restart: reclaimed-slots changed"); goto retry; } - if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { - mdbx_tassert(txn, - retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); - mdbx_notice("** restart: retired-list growth (%u -> %u)", - retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + if (unlikely(ctx->retired_stored != + MDBX_PNL_SIZE(txn->tw.retired_pages))) { + tASSERT(txn, + ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); + NOTICE("** restart: retired-list growth (%u -> %u)", + ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); goto retry; } @@ -13332,15 +13424,14 @@ retry: pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; - mdbx_trace("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO - "] @%" PRIaTXN, - dbg_prefix_mode, chunk, - (unsigned)(from - txn->tw.reclaimed_pglist), from[0], - (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id); + TRACE("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN, + dbg_prefix_mode, chunk, (unsigned)(from - txn->tw.reclaimed_pglist), + from[0], (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], + fill_gc_id); left -= chunk; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored + amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -13350,49 +13441,47 @@ retry: } if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, lifo != 0); + tASSERT(txn, ctx->lifo != 0); } } } - mdbx_tassert(txn, rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { - mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count); + NOTICE("** restart: got %u loose pages", txn->tw.loose_count); goto retry; } - if (unlikely(filled_gc_slot != + if (unlikely(ctx->filled_slot != (txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0))) { - const bool will_retry = loop < 9; - mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)", - will_retry ? "restart" : "ignore", filled_gc_slot, loop); + const bool will_retry = ctx->loop < 9; + NOTICE("** %s: reserve excess (filled-slot %u, loop %u)", + will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop); if (will_retry) goto retry; } - mdbx_tassert(txn, - txn->tw.lifo_reclaimed == NULL || - cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + tASSERT(txn, txn->tw.lifo_reclaimed == NULL || + ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; + txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; -bailout_notracking: MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; - mdbx_trace("<<< %u loops, rc = %d", loop, rc); + TRACE("<<< %u loops, rc = %d", ctx->loop, rc); return rc; } -static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { +static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { MDBX_dpl *const dl = - (txn->mt_flags & MDBX_WRITEMAP) ? 
txn->tw.dirtylist : mdbx_dpl_sort(txn); + (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); int rc = MDBX_SUCCESS; unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { @@ -13407,17 +13496,20 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { break; } - if (ctx->iov_items) - rc = mdbx_iov_write(txn, ctx); + if (ctx->iov_items) { + /* iov_page() frees dirty-pages and reset iov_items in case of failure. */ + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(txn, ctx); + } while (r <= dl->length) dl->items[++w] = dl->items[r++]; dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += r - 1 - w; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); return rc; } @@ -13425,8 +13517,7 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, unsigned validity) { if (likely(dbi < txn->mt_numdbs)) { - mdbx_memory_fence(mo_AcquireRelease, false); - if (likely(!TXN_DBI_CHANGED(txn, dbi))) { + if (likely(!dbi_changed(txn, dbi))) { if (likely(txn->mt_dbistate[dbi] & validity)) return true; if (likely(dbi < CORE_DBS || @@ -13442,48 +13533,48 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Merge child txn into parent */ -static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, - const unsigned parent_retired_len) { - MDBX_dpl *const src = mdbx_dpl_sort(txn); +static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const unsigned parent_retired_len) { + MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ - MDBX_dpl *const dst = mdbx_dpl_sort(parent); + MDBX_dpl *const dst = dpl_sort(parent); if (MDBX_ENABLE_REFUND) { unsigned n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - MDBX_page *dp = dst->items[n].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); + unsigned npages = dpl_npages(dst, n); + dpage_free(txn->mt_env, dst->items[n].ptr, npages); } --n; } parent->tw.dirtyroom += dst->sorted - n; dst->sorted = dpl_setlen(dst, n); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Remove reclaimed pages from parent's dirty list */ const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; - mdbx_dpl_sift(parent, reclaimed_list, false); + dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ unsigned r, w, d, s, l; for (r = w = parent_retired_len; ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { const pgno_t pgno = parent->tw.retired_pages[r]; - const unsigned di = mdbx_dpl_exist(parent, pgno); - const unsigned si = !di ? mdbx_search_spilled(parent, pgno) : 0; + const unsigned di = dpl_exist(parent, pgno); + const unsigned si = !di ? 
search_spilled(parent, pgno) : 0; unsigned npages; const char *kind; if (di) { MDBX_page *dp = dst->items[di].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_SPILLED)) == 0); npages = dpl_npages(dst, di); - mdbx_page_wash(parent, di, dp, npages); + page_wash(parent, di, dp, npages); kind = "dirty"; l = 1; if (unlikely(npages > l)) { @@ -13515,23 +13606,23 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } else if (unlikely(si)) { l = npages = 1; - mdbx_spill_remove(parent, si, 1); + spill_remove(parent, si, 1); kind = "spilled"; } else { parent->tw.retired_pages[++w] = pgno; continue; } - mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, - kind, pgno); - int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); - mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS); + DEBUG("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, kind, + pgno); + int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + ENSURE(txn->mt_env, err == MDBX_SUCCESS); } MDBX_PNL_SIZE(parent->tw.retired_pages) = w; /* Filter-out parent spill list */ if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { - const MDBX_PNL sl = mdbx_spill_purge(parent); + const MDBX_PNL sl = spill_purge(parent); unsigned len = MDBX_PNL_SIZE(sl); if (len) { /* Remove refunded pages from parent's spill list */ @@ -13542,7 +13633,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); do { if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); MDBX_PNL_SIZE(sl) = i; @@ -13552,14 +13643,13 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, do { ++i; if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); MDBX_PNL_SIZE(sl) = len -= i; memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); #endif } - mdbx_tassert( - txn, mdbx_pnl_check4assert(sl, (size_t)parent->mt_next_pgno << 1)); + tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); /* Remove reclaimed pages from parent's spill list */ s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); @@ -13576,9 +13666,9 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, s -= !cmp; r -= cmp; } else { - mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO, - reclaimed_pgno); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + spill_remove(parent, s, 1); --s; --r; } @@ -13607,41 +13697,40 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, continue; } - mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, - dirty_pgno_form); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + spill_remove(parent, s, 1); s += step; } /* Squash deleted pagenums if we deleted any */ - mdbx_spill_purge(parent); + spill_purge(parent); } } /* Remove anything in our spill list from parent's dirty 
list */ if (txn->tw.spill_pages) { - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); - mdbx_dpl_sift(parent, txn->tw.spill_pages, true); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages, + (size_t)parent->mt_next_pgno << 1)); + dpl_sift(parent, txn->tw.spill_pages, true); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Find length of merging our dirty list with parent's and release * filter-out pages */ for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); const unsigned s_npages = dpl_npages(src, s); const pgno_t s_pgno = src->items[s].pgno; MDBX_page *dp = dst->items[d].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_SPILLED)) == 0); const unsigned d_npages = dpl_npages(dst, d); const pgno_t d_pgno = dst->items[d].pgno; @@ -13658,18 +13747,17 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } else { dst->items[d--].ptr = nullptr; if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, dp, d_npages); + dpage_free(txn->mt_env, dp, d_npages); } } assert(dst->sorted == dst->length); - mdbx_tassert(parent, dst->detent >= l + d + s); + tASSERT(parent, dst->detent >= l + d + s); dst->sorted = l + d + s; /* the merged length */ while (s > 0) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); if (sp->mp_flags != P_LOOSE) { sp->mp_txnid = parent->mt_front; sp->mp_flags &= ~P_SPILLED; @@ -13691,7 +13779,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } ++w; } - mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1); + NOTICE("squash to begin for extending-merge %u -> %u", d, w - 1); d = w - 1; continue; } @@ -13721,7 +13809,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } } else { - /* from begin to end with dst shrinking (a lot of new overflow pages) */ + /* from begin to end with shrinking (a lot of new large/overflow pages) */ for (l = s = d = 1; s <= src->length && d <= dst->length;) { if (unlikely(l >= d)) { /* squash to get a gap of free space for merge */ @@ -13733,7 +13821,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } --w; } - mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1); + NOTICE("squash to end for shrinking-merge %u -> %u", d, w + 1); d = w + 1; continue; } @@ -13767,25 +13855,32 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); parent->tw.dirtylru = txn->tw.dirtylru; - 
mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_dpl_free(txn); + + /* As currently understood, it is cheaper to recount the number of pages here + * than to mix extra branching and arithmetic into the loops above. */ + dst->pages_including_loose = 0; + for (r = 1; r <= dst->length; ++r) + dst->pages_including_loose += dpl_npages(dst, r); + + tASSERT(parent, dirtylist_check(parent)); + dpl_free(txn); if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { /* Must not fail since space was preserved above. */ - mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages); - mdbx_pnl_free(txn->tw.spill_pages); + pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); + pnl_free(txn->tw.spill_pages); } else { parent->tw.spill_pages = txn->tw.spill_pages; parent->tw.spill_least_removed = txn->tw.spill_least_removed; } - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; if (parent->tw.spill_pages) { - assert(mdbx_pnl_check4assert(parent->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); + assert(pnl_check_allocated(parent->tw.spill_pages, + (size_t)parent->mt_next_pgno << 1)); if (MDBX_PNL_SIZE(parent->tw.spill_pages)) parent->mt_flags |= MDBX_TXN_SPILLS; } @@ -13794,7 +13889,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); - const uint64_t ts_0 = latency ? mdbx_osal_monotime() : 0; + const uint64_t ts_0 = latency ? osal_monotime() : 0; uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; uint32_t audit_duration = 0; @@ -13809,58 +13904,58 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_env *env = txn->mt_env; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; rc = MDBX_PANIC; goto provide_latency; } #endif /* MDBX_ENV_CHECKPID */ - /* mdbx_txn_end() mode for a commit which writes nothing */ + /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) goto done; if (txn->mt_child) { rc = mdbx_txn_commit_ex(txn->mt_child, NULL); - mdbx_tassert(txn, txn->mt_child == NULL); + tASSERT(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } if (unlikely(txn != env->me_txn)) { - mdbx_debug("%s", "attempt to commit unknown transaction"); + DEBUG("%s", "attempt to commit unknown transaction"); rc = MDBX_EINVAL; goto fail; } if (txn->mt_parent) { - mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); - mdbx_assert(env, txn != env->me_txn0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); + eASSERT(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert(env, mdbx_dirtylist_check(txn)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, dirtylist_check(txn)); if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { for (int i = txn->mt_numdbs; --i 
>= 0;) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); if ((txn->mt_dbistate[i] & DBI_STALE) && !(parent->mt_dbistate[i] & DBI_STALE)) - mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], - sizeof(MDBX_db)) == 0); + tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); } - mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, - sizeof(parent->mt_geo)) == 0); - mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, - sizeof(parent->mt_canary)) == 0); - mdbx_tassert(txn, !txn->tw.spill_pages || - MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + tASSERT(txn, + !txn->tw.spill_pages || MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); + tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; @@ -13871,30 +13966,29 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { * if allocation fails. */ const unsigned parent_retired_len = (unsigned)(uintptr_t)parent->tw.retired_pages; - mdbx_tassert(txn, - parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); + tASSERT(txn, parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); const unsigned retired_delta = MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { - rc = mdbx_pnl_need(&parent->tw.spill_pages, - MDBX_PNL_SIZE(txn->tw.spill_pages)); + rc = pnl_need(&parent->tw.spill_pages, + MDBX_PNL_SIZE(txn->tw.spill_pages)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - mdbx_spill_purge(txn); + spill_purge(txn); } if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent && - !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length + - parent->tw.dirtylist->length))) { + !dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { rc = MDBX_ENOMEM; goto fail; } @@ -13907,7 +14001,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.retired_pages = txn->tw.retired_pages; txn->tw.retired_pages = NULL; - mdbx_pnl_free(parent->tw.reclaimed_pglist); + pnl_free(parent->tw.reclaimed_pglist); parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; txn->tw.reclaimed_pglist = NULL; parent->tw.last_reclaimed = txn->tw.last_reclaimed; @@ -13924,7 +14018,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.loose_pages = txn->tw.loose_pages; /* Merge our cursors into parent's and close them */ - mdbx_cursors_eot(txn, true); + cursors_eot(txn, true); end_mode |= MDBX_END_EOTDONE; /* Update parent's DBs array */ @@ -13935,53 +14029,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { const uint8_t state = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i, - (parent->mt_dbistate[i] != state) ? 
"update" : "still", - parent->mt_dbistate[i], state); + DEBUG("db %u dbi-state %s 0x%02x -> 0x%02x", i, + (parent->mt_dbistate[i] != state) ? "update" : "still", + parent->mt_dbistate[i], state); parent->mt_dbistate[i] = state; } - ts_1 = latency ? mdbx_osal_monotime() : 0; - mdbx_txn_merge(parent, txn, parent_retired_len); - ts_2 = latency ? mdbx_osal_monotime() : 0; + ts_1 = latency ? osal_monotime() : 0; + txn_merge(parent, txn, parent_retired_len); + ts_2 = latency ? osal_monotime() : 0; env->me_txn = parent; parent->mt_child = NULL; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); #if MDBX_ENABLE_REFUND - mdbx_refund(parent); - if (mdbx_assert_enabled()) { + txn_refund(parent); + if (ASSERT_ENABLED()) { /* Check parent's loose pages not suitable for refund */ for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) - mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl && - lp->mp_pgno + 1 < parent->mt_next_pgno); + tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && + lp->mp_pgno + 1 < parent->mt_next_pgno); /* Check parent's reclaimed pages not suitable for refund */ if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) - mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < - parent->mt_next_pgno); + tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < + parent->mt_next_pgno); } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0; + ts_4 = ts_3 = latency ? osal_monotime() : 0; txn->mt_signature = 0; - mdbx_free(txn); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + osal_free(txn); + tASSERT(parent, audit_ex(parent, 0, false) == 0); rc = MDBX_SUCCESS; goto provide_latency; } - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - mdbx_cursors_eot(txn, false); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; if (txn->tw.dirtylist->length == 0 && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { for (int i = txn->mt_numdbs; --i >= 0;) - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT - rc = mdbx_txn_end(txn, end_mode); + rc = txn_end(txn, end_mode); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = MDBX_RESULT_TRUE; @@ -13991,10 +14085,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ } - mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (void *)txn, (void *)env, - txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { @@ -14002,15 +14096,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_val data; data.iov_len = sizeof(MDBX_db); - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbistate[i] & DBI_DIRTY) { MDBX_db *db = &txn->mt_dbs[i]; - mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); + DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); + /* mod_txnid may be greater than front after committing nested transactions */ db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; WITH_CURSOR_TRACKING(couple.outer, @@ -14023,8 +14118,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } } - ts_1 = latency ? mdbx_osal_monotime() : 0; - rc = mdbx_update_gc(txn); + ts_1 = latency ? osal_monotime() : 0; + gcu_context_t gcu_ctx; + rc = gcu_context_init(txn, &gcu_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = update_gc(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -14036,44 +14135,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = latency ? mdbx_osal_monotime() : 0; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); - const uint64_t audit_end = mdbx_osal_monotime(); - audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); + ts_2 = latency ? osal_monotime() : 0; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + const uint64_t audit_end = osal_monotime(); + audit_duration = osal_monotime_to_16dot16(audit_end - ts_2); ts_2 = audit_end; if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); - rc = mdbx_txn_write(txn, &ctx); + struct iov_ctx write_ctx; + iov_init(txn, &write_ctx); + rc = txn_write(txn, &write_ctx); if (likely(rc == MDBX_SUCCESS)) - mdbx_iov_done(txn, &ctx); + iov_done(txn, &write_ctx); /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_3 = latency ? mdbx_osal_monotime() : 0; + ts_3 = latency ? 
osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { - const MDBX_meta *head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8); - meta.mm_extra_flags = head->mm_extra_flags; - meta.mm_validator_id = head->mm_validator_id; - meta.mm_extra_pagehdr = head->mm_extra_pagehdr; + memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); + meta.mm_extra_flags = head.ptr_c->mm_extra_flags; + meta.mm_validator_id = head.ptr_c->mm_validator_id; + meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head->mm_pages_retired) + + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + MDBX_PNL_SIZE(txn->tw.retired_pages)); meta.mm_geo = txn->mt_geo; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_canary = txn->mt_canary; - meta_set_txnid(env, &meta, txn->mt_txnid); - rc = mdbx_sync_locked( - env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); + txnid_t commit_txnid = txn->mt_txnid; +#if MDBX_ENABLE_BIGFOOT + if (gcu_ctx.bigfoot > txn->mt_txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, + (unsigned)(commit_txnid - txn->mt_txnid)); + } +#endif + meta_set_txnid(env, &meta, commit_txnid); + + rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, + &meta, &txn->tw.troika); } - ts_4 = latency ? mdbx_osal_monotime() : 0; + ts_4 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; goto fail; @@ -14082,22 +14190,18 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: - rc = mdbx_txn_end(txn, end_mode); + rc = txn_end(txn, end_mode); provide_latency: if (latency) { latency->audit = audit_duration; - latency->preparation = - ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc = - (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->write = - (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->sync = - (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - const uint64_t ts_5 = mdbx_osal_monotime(); - latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->write = (ts_2 && ts_3) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->sync = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + const uint64_t ts_5 = osal_monotime(); + latency->ending = ts_4 ? 
osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->whole = osal_monotime_to_16dot16(ts_5 - ts_0); } return rc; @@ -14106,82 +14210,78 @@ fail: goto provide_latency; } -static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, - const MDBX_page *const page, - const unsigned meta_number, - unsigned *guess_pagesize) { +static int validate_meta(MDBX_env *env, MDBX_meta *const meta, + const MDBX_page *const page, + const unsigned meta_number, unsigned *guess_pagesize) { const uint64_t magic_and_version = unaligned_peek_u64(4, &meta->mm_magic_and_version); if (unlikely(magic_and_version != MDBX_DATA_MAGIC && magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { - mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - magic_and_version); + ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number, + magic_and_version); return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID : MDBX_VERSION_MISMATCH; } if (unlikely(page->mp_pgno != meta_number)) { - mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, - page->mp_pgno); + ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno); return MDBX_INVALID; } if (unlikely(page->mp_flags != P_META)) { - mdbx_error("page #%u not a meta-page", meta_number); + ERROR("page #%u not a meta-page", meta_number); return MDBX_INVALID; } /* LY: check pagesize */ if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || meta->mm_psize > MAX_PAGESIZE)) { - mdbx_warning("meta[%u] has invalid pagesize (%u), skip it", meta_number, - meta->mm_psize); + WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->mm_psize); return is_powerof2(meta->mm_psize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID; } if (guess_pagesize && *guess_pagesize != meta->mm_psize) { *guess_pagesize = meta->mm_psize; - mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); + VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize); } const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a); if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) { - mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + WARNING("meta[%u] not completely updated, skip it", meta_number); return MDBX_RESULT_TRUE; } /* LY: check signature as a checksum */ if (META_IS_STEADY(meta) && - unlikely(unaligned_peek_u64(4, &meta->mm_datasync_sign) != - meta_sign(meta))) { - mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64 - " != 0x%" PRIx64 "), skip it", - meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), - meta_sign(meta)); + unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) { + WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 + "), skip it", + meta_number, unaligned_peek_u64(4, &meta->mm_sign), + meta_sign(meta)); return MDBX_RESULT_TRUE; } - mdbx_debug("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, - meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, - meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, - pv2pages(meta->mm_geo.grow_pv), pv2pages(meta->mm_geo.shrink_pv), - txnid, mdbx_durable_str(meta)); + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", 
+ page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, + meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, + meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv), + pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta)); if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { - mdbx_warning("meta[%u] has invalid txnid %" PRIaTXN ", skip it", - meta_number, txnid); + WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, + txnid); return MDBX_RESULT_TRUE; } /* LY: check min-pages value */ if (unlikely(meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO + 1)) { - mdbx_warning("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.lower); + WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.lower); return MDBX_INVALID; } @@ -14189,16 +14289,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(meta->mm_geo.upper < MIN_PAGENO || meta->mm_geo.upper > MAX_PAGENO + 1 || meta->mm_geo.upper < meta->mm_geo.lower)) { - mdbx_warning("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.upper); + WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.upper); return MDBX_INVALID; } /* LY: check last_pgno */ if (unlikely(meta->mm_geo.next < MIN_PAGENO || meta->mm_geo.next - 1 > MAX_PAGENO)) { - mdbx_warning("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next); + WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next); return MDBX_CORRUPTED; } @@ -14206,20 +14306,20 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { /* Here could be a race with DB-shrinking performed by other process */ - int err = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); + int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(err != MDBX_SUCCESS)) return err; if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { - mdbx_warning("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 - "), skip it", - meta_number, used_bytes, env->me_dxb_mmap.filesize); + WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, env->me_dxb_mmap.filesize); return MDBX_CORRUPTED; } } if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO || used_bytes > MAX_MAPSIZE)) { - mdbx_warning("meta[%u] has too large used-space (%" PRIu64 "), skip it", - meta_number, used_bytes); + WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", + meta_number, used_bytes); return MDBX_TOO_LARGE; } @@ -14232,24 +14332,24 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && mapsize_min <= MAX_MAPSIZE64) { - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large min-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_min, used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " + "but size of used space 
still acceptable (%" PRIu64 ")", + meta_number, mapsize_min, used_bytes); geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize); if (geo_lower > MAX_PAGENO + 1) { geo_lower = MAX_PAGENO + 1; mapsize_min = geo_lower * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "lower", geo_lower, meta->mm_geo.lower); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "lower", geo_lower, meta->mm_geo.lower); meta->mm_geo.lower = geo_lower; } else { - mdbx_warning("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_min); + WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); return MDBX_VERSION_MISMATCH; } } @@ -14262,25 +14362,25 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / (size_t)meta->mm_psize)) { if (mapsize_max > MAX_MAPSIZE64) { - mdbx_warning("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_max); + WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); return MDBX_VERSION_MISMATCH; } /* allow to open large DB from a 32-bit environment */ - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large max-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_max, used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize); if (geo_upper > MAX_PAGENO + 1) { geo_upper = MAX_PAGENO + 1; mapsize_max = geo_upper * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "upper", geo_upper, meta->mm_geo.upper); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "upper", geo_upper, meta->mm_geo.upper); meta->mm_geo.upper = geo_upper; } @@ -14298,16 +14398,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, geo_now = geo_upper; if (unlikely(meta->mm_geo.next > geo_now)) { - mdbx_warning("meta[%u] next-pageno (%" PRIaPGNO - ") is beyond end-pgno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next, geo_now); + WARNING("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next, geo_now); return MDBX_CORRUPTED; } if (meta->mm_geo.now != geo_now) { - mdbx_warning("meta[%u] consider geo-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "now", geo_now, meta->mm_geo.now); + WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "now", geo_now, meta->mm_geo.now); meta->mm_geo.now = geo_now; } @@ -14318,12 +14418,12 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, 
meta->mm_dbs[FREE_DBI].md_entries || meta->mm_dbs[FREE_DBI].md_leaf_pages || meta->mm_dbs[FREE_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s, skip it", meta_number, "GC"); + WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); return MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) { - mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "GC", meta->mm_dbs[FREE_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "GC", meta->mm_dbs[FREE_DBI].md_root); return MDBX_CORRUPTED; } @@ -14334,49 +14434,48 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, meta->mm_dbs[MAIN_DBI].md_entries || meta->mm_dbs[MAIN_DBI].md_leaf_pages || meta->mm_dbs[MAIN_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s", meta_number, "MainDB"); + WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); return MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) { - mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "MainDB", meta->mm_dbs[MAIN_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "MainDB", meta->mm_dbs[MAIN_DBI].md_root); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); return MDBX_CORRUPTED; } return MDBX_SUCCESS; } -static int mdbx_validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, - MDBX_meta *dest) { +static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, + MDBX_meta *dest) { *dest = *meta; - return mdbx_validate_meta(env, dest, data_page(meta), - bytes2pgno(env, (uint8_t *)meta - env->me_map), - nullptr); + return validate_meta(env, dest, data_page(meta), + bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr); } /* Read the environment parameters of a DB environment * before mapping it into memory. */ -__cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, - const int lck_exclusive, - const mdbx_mode_t mode_bits) { - int rc = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); +__cold static int read_header(MDBX_env *env, MDBX_meta *dest, + const int lck_exclusive, + const mdbx_mode_t mode_bits) { + int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; memset(dest, 0, sizeof(MDBX_meta)); - unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. 
*/ @@ -14393,54 +14492,61 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, char buffer[MIN_PAGESIZE]; unsigned retryleft = 42; while (1) { - mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", - meta_number, offset, MIN_PAGESIZE, retryleft); - int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, + offset, MIN_PAGESIZE, retryleft); + int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && env->me_dxb_mmap.filesize == 0 && mode_bits /* non-zero for DB creation */ != 0) - mdbx_notice("read meta: empty file (%d, %s)", err, - mdbx_strerror(err)); + NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); else - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } char again[MIN_PAGESIZE]; - err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) break; - mdbx_verbose("meta[%u] was updated, re-read it", meta_number); + VERBOSE("meta[%u] was updated, re-read it", meta_number); } if (!retryleft) { - mdbx_error("meta[%u] is too volatile, skip it", meta_number); + ERROR("meta[%u] is too volatile, skip it", meta_number); continue; } MDBX_page *const page = (MDBX_page *)buffer; MDBX_meta *const meta = page_meta(page); - rc = mdbx_validate_meta(env, meta, page, meta_number, &guess_pagesize); + rc = validate_meta(env, meta, page, meta_number, &guess_pagesize); if (rc != MDBX_SUCCESS) continue; - if ((env->me_stuck_meta < 0) - ? meta_ot(meta_bootid_match(meta) ? 
prefer_last : prefer_steady, - env, dest, meta) - : (meta_number == (unsigned)env->me_stuck_meta)) { + bool latch; + if (env->me_stuck_meta >= 0) + latch = (meta_number == (unsigned)env->me_stuck_meta); + else if (meta_bootid_match(meta)) + latch = meta_choice_recent( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + else + latch = meta_choice_steady( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + if (latch) { *dest = *meta; if (!lck_exclusive && !META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ - mdbx_verbose("latch meta[%u]", meta_number); + VERBOSE("latch meta[%u]", meta_number); } } @@ -14448,7 +14554,7 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, (env->me_stuck_meta < 0 && !(META_IS_STEADY(dest) || meta_weak_acceptable(env, dest, lck_exclusive)))) { - mdbx_error("%s", "no usable meta-pages, database is corrupted"); + ERROR("%s", "no usable meta-pages, database is corrupted"); if (rc == MDBX_SUCCESS) { /* TODO: try to restore the database by fully checking b-tree structure * for the each meta page, if the corresponding option was given */ @@ -14460,15 +14566,15 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, return MDBX_SUCCESS; } -__cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, - unsigned num) { - mdbx_ensure(env, is_powerof2(env->me_psize)); - mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); - mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); +__cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, + unsigned num) { + ENSURE(env, is_powerof2(env->me_psize)); + ENSURE(env, env->me_psize >= MIN_PAGESIZE); + ENSURE(env, env->me_psize <= MAX_PAGESIZE); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); memset(model, 0, env->me_psize); model->mp_pgno = num; @@ -14484,43 +14590,40 @@ __cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); model_meta->mm_geo.next = NUM_METAS; - mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); - mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); - mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); - mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); - mdbx_ensure(env, model_meta->mm_geo.grow_pv == - pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); - mdbx_ensure(env, model_meta->mm_geo.shrink_pv == - pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); + ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); + ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); + ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); + ENSURE(env, 
model_meta->mm_geo.grow_pv == + pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); + ENSURE(env, model_meta->mm_geo.shrink_pv == + pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); model_meta->mm_psize = env->me_psize; model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY; model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; meta_set_txnid(env, model_meta, MIN_TXNID + num); - unaligned_poke_u64(4, model_meta->mm_datasync_sign, meta_sign(model_meta)); - mdbx_assert(env, meta_checktxnid(env, model_meta, true)); + unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); + eASSERT(env, coherency_check_meta(env, model_meta, true)); return (MDBX_page *)((uint8_t *)model + env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. * Return pointer to recently (head) meta-page. */ -__cold static MDBX_meta *mdbx_init_metas(const MDBX_env *env, void *buffer) { +__cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page0 = (MDBX_page *)buffer; - MDBX_page *page1 = mdbx_meta_model(env, page0, 0); - MDBX_page *page2 = mdbx_meta_model(env, page1, 1); - mdbx_meta_model(env, page2, 2); - mdbx_assert(env, !meta_eq(env, page_meta(page0), page_meta(page1))); - mdbx_assert(env, !meta_eq(env, page_meta(page1), page_meta(page2))); - mdbx_assert(env, !meta_eq(env, page_meta(page2), page_meta(page0))); + MDBX_page *page1 = meta_model(env, page0, 0); + MDBX_page *page2 = meta_model(env, page1, 1); + meta_model(env, page2, 2); return page_meta(page2); } #if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) -static size_t mdbx_madvise_threshold(const MDBX_env *env, - const size_t largest_bytes) { +static size_t madvise_threshold(const MDBX_env *env, + const size_t largest_bytes) { /* TODO: use options */ const unsigned factor = 9; const size_t threshold = (largest_bytes < (65536ul << factor)) @@ -14532,20 +14635,19 @@ static size_t mdbx_madvise_threshold(const MDBX_env *env, } #endif /* MDBX_ENABLE_MADVISE */ -static int mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending) { - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); +static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, + meta_troika_t *const troika) { + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - const MDBX_meta *const head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, troika); int rc; - mdbx_assert(env, meta_eq_mask(env) == 0); - mdbx_assert(env, - pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); - mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); + eASSERT(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); + eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); + eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now); if (flags & MDBX_SAFE_NOSYNC) { /* Check auto-sync conditions */ @@ -14557,7 +14659,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period)) flags &= MDBX_WRITEMAP | 
MDBX_SHRINK_ALLOWED; /* force steady */ @@ -14566,10 +14668,11 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, pgno_t shrink = 0; if (flags & MDBX_SHRINK_ALLOWED) { /* LY: check conditions to discard unused pages */ - const pgno_t largest_pgno = mdbx_find_largest( - env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next - : pending->mm_geo.next); - mdbx_assert(env, largest_pgno >= NUM_METAS); + const pgno_t largest_pgno = find_largest_snapshot( + env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) + ? head.ptr_c->mm_geo.next + : pending->mm_geo.next); + eASSERT(env, largest_pgno >= NUM_METAS); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) const pgno_t edge = env->me_poison_edge; if (edge > largest_pgno) { @@ -14585,31 +14688,29 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) const size_t largest_bytes = pgno2bytes(env, largest_pgno); /* threshold to avoid unreasonable frequent madvise() calls */ - const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes); + const size_t threshold = madvise_threshold(env, largest_bytes); const size_t discard_edge_bytes = bytes_align2os_bytes( env, ((MDBX_RDONLY & (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak : env->me_flags)) ? largest_bytes - : largest_bytes + madvise_threshold)); + : largest_bytes + threshold)); const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); const pgno_t prev_discarded_pgno = atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno >= - discard_edge_pgno + bytes2pgno(env, madvise_threshold)) { - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, - prev_discarded_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { + NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, + prev_discarded_pgno); atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, mo_Relaxed); const size_t prev_discarded_bytes = ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); - mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); + ENSURE(env, prev_discarded_bytes > discard_edge_bytes); #if defined(MADV_DONTNEED) int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version > 0x04050000) + if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) advise = MADV_FREE; #endif /* MADV_FREE */ int err = madvise(env->me_map + discard_edge_bytes, @@ -14645,26 +14746,25 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, const pgno_t bottom = (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; if (pending->mm_geo.now > bottom) { - if (META_IS_STEADY(meta_prefer_steady(env))) + if (TROIKA_HAVE_STEADY(troika)) /* force steady, but only if steady-checkpoint is present */ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; shrink = pending->mm_geo.now - bottom; pending->mm_geo.now = bottom; - if (unlikely(constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a))) { - const txnid_t txnid = - safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); - mdbx_notice("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - unaligned_peek_u64(4, pending->mm_txnid_a), txnid); - mdbx_ensure(env, env->me_txn0->mt_owner != mdbx_thread_self() && - !env->me_txn); + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + pending->unsafe_txnid, txnid); + ENSURE(env, !env->me_txn0 || + (env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn)); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto fail; } meta_set_txnid(env, pending, txnid); - mdbx_assert(env, meta_checktxnid(env, pending, true)); + eASSERT(env, coherency_check_meta(env, pending, true)); } } } @@ -14674,11 +14774,12 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; if ((flags & MDBX_SAFE_NOSYNC) == 0) { mode_bits = MDBX_SYNC_DATA; - if (pending->mm_geo.next > meta_prefer_steady(env)->mm_geo.now) + if (pending->mm_geo.next > + meta_prefer_steady(env, troika).ptr_c->mm_geo.now) mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; @@ -14688,111 +14789,107 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) rc = - mdbx_msync(&env->me_dxb_mmap, 0, + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); else - rc = mdbx_fsync(env->me_lazy_fd, mode_bits); + rc = osal_fsync(env->me_lazy_fd, mode_bits); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; } - mdbx_assert(env, meta_checktxnid(env, pending, true)); + eASSERT(env, coherency_check_meta(env, pending, true)); /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - atomic_store64(&env->me_lck->mti_sync_timestamp, mdbx_osal_monotime(), + atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), mo_Relaxed); - unaligned_poke_u64(4, pending->mm_datasync_sign, meta_sign(pending)); + unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } + const bool legal4overwrite = + head.txnid == pending->unsafe_txnid && + memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) == + 0 && + memcmp(&head.ptr_c->mm_canary, &pending->mm_canary, + sizeof(pending->mm_canary)) == 0 && + memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) == + 0; MDBX_meta *target = nullptr; - if (constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a)) { - mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); - mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(pending->mm_geo)) == 0); - if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) - target = (MDBX_meta *)head; + if (head.txnid == pending->unsafe_txnid) { + ENSURE(env, legal4overwrite); + if (!head.is_steady && META_IS_STEADY(pending)) + target = (MDBX_meta *)head.ptr_c; else { - mdbx_ensure(env, meta_eq(env, head, pending)); - mdbx_debug("%s", "skip update meta"); + WARNING("%s", "skip update meta"); return MDBX_SUCCESS; } - } else if (head == meta0) - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta1, meta2); - else if (head == meta1) - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta2); - else { - mdbx_assert(env, head == meta2); - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta1); + } else { + const unsigned troika_tail = troika->tail_and_flags & 3; + ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent && + troika_tail != troika->prefer_steady); + target = (MDBX_meta *)meta_tail(env, troika).ptr_c; } /* LY: step#2 - update meta-page. 
*/ - mdbx_debug( - "writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, - pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, - pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), - unaligned_peek_u64(4, pending->mm_txnid_a), mdbx_durable_str(pending)); + DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, + pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, + pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), + pending->unsafe_txnid, durable_caption(pending)); - mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta0 == head) ? "head" - : (meta0 == target) ? "tail" - : "stay", - mdbx_durable_str(meta0), meta_txnid(env, meta0), - meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta1 == head) ? "head" - : (meta1 == target) ? "tail" - : "stay", - mdbx_durable_str(meta1), meta_txnid(env, meta1), - meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta2 == head) ? "head" - : (meta2 == target) ? "tail" - : "stay", - mdbx_durable_str(meta2), meta_txnid(env, meta2), - meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta0 == head.ptr_c) ? "head" + : (meta0 == target) ? "tail" + : "stay", + durable_caption(meta0), constmeta_txnid(meta0), + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta1 == head.ptr_c) ? "head" + : (meta1 == target) ? "tail" + : "stay", + durable_caption(meta1), constmeta_txnid(meta1), + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta2 == head.ptr_c) ? "head" + : (meta2 == target) ? 
"tail" + : "stay", + durable_caption(meta2), constmeta_txnid(meta2), + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); - mdbx_assert(env, !meta_eq(env, pending, meta0)); - mdbx_assert(env, !meta_eq(env, pending, meta1)); - mdbx_assert(env, !meta_eq(env, pending, meta2)); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta0))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta1))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta2))); - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - mdbx_ensure(env, - target == head || constmeta_txnid(env, target) < - unaligned_peek_u64(4, pending->mm_txnid_a)); + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + ENSURE(env, target == head.ptr_c || + constmeta_txnid(target) < pending->unsafe_txnid); #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { - mdbx_jitter4testing(true); - if (likely(target != head)) { + jitter4testing(true); + if (likely(target != head.ptr_c)) { /* LY: 'invalidate' the meta. */ - meta_update_begin(env, target, - unaligned_peek_u64(4, pending->mm_txnid_a)); - unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK); + meta_update_begin(env, target, pending->unsafe_txnid); + unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK); #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * to allow readers catch actual pagesize. */ uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; - uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; + uint8_t *provoke_end = (uint8_t *)&target->mm_sign; memset(provoke_begin, 0xCC, provoke_end - provoke_begin); - mdbx_jitter4testing(false); + jitter4testing(false); #endif /* LY: update info */ @@ -14801,31 +14898,24 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); - mdbx_jitter4testing(true); + jitter4testing(true); /* LY: 'commit' the meta */ meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); - mdbx_jitter4testing(true); - mdbx_assert(env, meta_checktxnid(env, target, true)); + jitter4testing(true); + eASSERT(env, coherency_check_meta(env, target, true)); } else { - /* dangerous case (target == head), only mm_datasync_sign could + /* dangerous case (target == head), only mm_sign could * me updated, check assertions once again */ - mdbx_ensure(env, constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a) && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); - mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(head->mm_geo)) == 0); - mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); + eASSERT(env, + legal4overwrite && !head.is_steady && META_IS_STEADY(pending)); } - memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); - mdbx_flush_incoherent_cpu_writeback(); - mdbx_jitter4testing(true); + memcpy(target->mm_sign, pending->mm_sign, 8); + osal_flush_incoherent_cpu_writeback(); + jitter4testing(true); /* sync meta-pages */ rc = - 
mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) @@ -14838,39 +14928,53 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta), + rc = osal_pwrite(fd, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: - mdbx_debug("%s", "write failed, disk error?"); + DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. */ - mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta), + osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } - mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; } - mdbx_assert(env, meta_checktxnid(env, target, true)); + } + + uint64_t timestamp = 0; + while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { + rc = + coherency_check_written(env, pending->unsafe_txnid, target, &timestamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto fail; } env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a) - + (uint32_t)pending->unsafe_txnid - ((flags & MDBX_NOMETASYNC) ?
UINT32_MAX / 3 : 0); + *troika = meta_tap(env); + for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) + if (troika != &txn->tw.troika) + txn->tw.troika = *troika; + /* LY: shrink datafile if needed */ if (unlikely(shrink)) { - mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", - pending->mm_geo.now, shrink); - rc = mdbx_mapresize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper); + VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->mm_geo.now, shrink); + rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; - mdbx_assert(env, meta_checktxnid(env, target, true)); + eASSERT(env, coherency_check_meta(env, target, true)); } MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -14898,23 +15002,23 @@ static void recalculate_merge_threshold(MDBX_env *env) { : bytes / 4 /* 25 % */)); } -__cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { +__cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); - mdbx_ensure(env, is_powerof2(pagesize)); - mdbx_ensure(env, pagesize >= MIN_PAGESIZE); - mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + ENSURE(env, is_powerof2(pagesize)); + ENSURE(env, pagesize >= MIN_PAGESIZE); + ENSURE(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT); const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, maxgc_ov1page > 42 && - maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); + ENSURE(env, + maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); @@ -14924,16 +15028,15 @@ __cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); - mdbx_ensure(env, - branch_nodemax > (intptr_t)(NODESIZE + 42) && + ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && branch_nodemax % 2 == 0 && leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); env->me_leaf_nodemax = (unsigned)leaf_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); - mdbx_assert(env, pgno2bytes(env, 1) == pagesize); - mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); + eASSERT(env, pgno2bytes(env, 1) == pagesize); + eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); recalculate_merge_threshold(env); const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); @@ -14942,7 +15045,7 @@ __cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { intptr_t total_ram_pages, avail_ram_pages; int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); if (unlikely(err != MDBX_SUCCESS)) - mdbx_error("mdbx_get_sysraminfo(), rc %d", err); + ERROR("mdbx_get_sysraminfo(), rc %d", err); else { size_t reasonable_dpl_limit = (size_t)(total_ram_pages + avail_ram_pages) 
/ 42; @@ -14974,7 +15077,7 @@ lckless_stub(const MDBX_env *env) { } __cold int mdbx_env_create(MDBX_env **penv) { - MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; @@ -14983,7 +15086,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_lazy_fd = INVALID_HANDLE_VALUE; env->me_dsync_fd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE; - env->me_pid = mdbx_getpid(); + env->me_pid = osal_getpid(); env->me_stuck_meta = -1; env->me_options.dp_reserve_limit = 1024; @@ -15001,37 +15104,37 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; int rc; - const size_t os_psize = mdbx_syspagesize(); + const size_t os_psize = osal_syspagesize(); if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); + ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); rc = MDBX_INCOMPATIBLE; goto bailout; } env->me_os_psize = (unsigned)os_psize; - mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize - : MAX_PAGESIZE); + setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize + : MAX_PAGESIZE); - rc = mdbx_fastmutex_init(&env->me_dbi_lock); + rc = osal_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_Init(&env->me_remap_guard); + osal_srwlock_Init(&env->me_remap_guard); InitializeCriticalSection(&env->me_windowsbug_lock); #else - rc = mdbx_fastmutex_init(&env->me_remap_guard); + rc = osal_fastmutex_init(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - rc = mdbx_ipclock_stub(&stub->mti_wlock); + rc = osal_ipclock_stub(&stub->mti_wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_remap_guard); - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_remap_guard); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #endif /* Windows */ @@ -15042,7 +15145,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { return MDBX_SUCCESS; bailout: - mdbx_free(env); + osal_free(env); *penv = nullptr; return rc; } @@ -15087,7 +15190,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, return rc; const bool inside_txn = - (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()); + (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); #if MDBX_DEBUG if (growth_step < 0) { @@ -15109,33 +15212,36 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(err != MDBX_SUCCESS)) return err; need_unlock = true; - } - const MDBX_meta *head = constmeta_prefer_last(env); - if (!inside_txn) { - env->me_txn0->mt_txnid = constmeta_txnid(env, head); - mdbx_find_oldest(env->me_txn0); + env->me_txn0->tw.troika = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); + env->me_txn0->mt_txnid = + env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent]; + txn_oldest_reader(env->me_txn0); } - /* get untouched params from DB */ + /* get untouched params from current TXN or DB */ if (pagesize <= 0 || pagesize >= INT_MAX) pagesize = env->me_psize; + const MDBX_geo *const geo = + inside_txn ? 
&env->me_txn->mt_geo + : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo; if (size_lower < 0) - size_lower = pgno2bytes(env, head->mm_geo.lower); + size_lower = pgno2bytes(env, geo->lower); if (size_now < 0) - size_now = pgno2bytes(env, head->mm_geo.now); + size_now = pgno2bytes(env, geo->now); if (size_upper < 0) - size_upper = pgno2bytes(env, head->mm_geo.upper); + size_upper = pgno2bytes(env, geo->upper); if (growth_step < 0) - growth_step = pgno2bytes(env, pv2pages(head->mm_geo.grow_pv)); + growth_step = pgno2bytes(env, pv2pages(geo->grow_pv)); if (shrink_threshold < 0) - shrink_threshold = pgno2bytes(env, pv2pages(head->mm_geo.shrink_pv)); + shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv)); if (pagesize != (intptr_t)env->me_psize) { rc = MDBX_EINVAL; goto bailout; } const size_t usedbytes = - pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); + pgno2bytes(env, find_largest_snapshot(env, geo->next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; @@ -15155,7 +15261,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pagesize = env->me_os_psize; if ((uintptr_t)pagesize > MAX_PAGESIZE) pagesize = MAX_PAGESIZE; - mdbx_assert(env, (uintptr_t)pagesize >= MIN_PAGESIZE); + eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE); } else if (pagesize == 0 /* minimal */) pagesize = MIN_PAGESIZE; @@ -15229,8 +15335,13 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { - rc = MDBX_EINVAL; - goto bailout; + size_lower = pagesize * MIN_PAGENO; + if (unlikely(size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; + } + if (size_now < size_lower) + size_now = size_lower; } if (unlikely((size_t)size_upper > MAX_MAPSIZE || @@ -15260,7 +15371,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if ((size_t)size_upper < (size_t)size_lower) size_lower = size_upper; } - mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0); + eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0); if (size_now < size_lower) size_now = size_lower; @@ -15289,7 +15400,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (!env->me_map) { /* save user's geo-params for future open/create */ if (pagesize != (intptr_t)env->me_psize) - mdbx_setup_pagesize(env, pagesize); + setup_pagesize(env, pagesize); env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; env->me_dbgeo.upper = size_upper; @@ -15298,52 +15409,52 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); - mdbx_ensure(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); + ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, - env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); - mdbx_ensure(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0); + ENSURE(env, 
env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); + ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); - mdbx_ensure(env, env->me_dbgeo.now % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0); rc = MDBX_SUCCESS; } else { /* apply new params to opened environment */ - mdbx_ensure(env, pagesize == (intptr_t)env->me_psize); + ENSURE(env, pagesize == (intptr_t)env->me_psize); MDBX_meta meta; memset(&meta, 0, sizeof(meta)); const MDBX_geo *current_geo; if (!inside_txn) { - mdbx_assert(env, need_unlock); - const MDBX_meta *head = constmeta_prefer_last(env); + eASSERT(env, need_unlock); + const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; while ("workaround for " "todo4recovery://erased_by_github/libmdbx/issues/269") { - meta = *head; - rc = meta_waittxnid(env, &meta, &timestamp); + meta = *head.ptr_c; + rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, + &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - const txnid_t txnid = safe64_txnid_next(constmeta_txnid(env, &meta)); + const txnid_t txnid = safe64_txnid_next(head.txnid); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } meta_set_txnid(env, &meta, txnid); @@ -15360,22 +15471,19 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); new_geo.next = current_geo->next; - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); - mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); - mdbx_ensure(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); - mdbx_ensure(env, - new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); + ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); + ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); + ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); + ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); + ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); - mdbx_ensure(env, (size_t)size_lower >= MIN_MAPSIZE); - mdbx_ensure(env, new_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, (size_t)size_upper <= MAX_MAPSIZE); - mdbx_ensure(env, new_geo.upper <=
MAX_PAGENO + 1); - mdbx_ensure(env, new_geo.now >= new_geo.next); - mdbx_ensure(env, new_geo.upper >= new_geo.now); - mdbx_ensure(env, new_geo.now >= new_geo.lower); + ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); + ENSURE(env, new_geo.lower >= MIN_PAGENO); + ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); + ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, new_geo.now >= new_geo.next); + ENSURE(env, new_geo.upper >= new_geo.now); + ENSURE(env, new_geo.now >= new_geo.lower); if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { #if defined(_WIN32) || defined(_WIN64) @@ -15387,7 +15495,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EPERM; goto bailout; } - int err = mdbx_rdt_lock(env); + int err = osal_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(err))) { rc = err; goto bailout; @@ -15408,7 +15516,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } } - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -15416,8 +15524,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper, - false); + rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper, + false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -15426,7 +15534,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_txn->mt_flags |= MDBX_TXN_DIRTY; } else { meta.mm_geo = new_geo; - rc = mdbx_sync_locked(env, env->me_flags, &meta); + rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); } if (likely(rc == MDBX_SUCCESS)) { @@ -15471,23 +15579,23 @@ __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { __cold static int alloc_page_buf(MDBX_env *env) { return env->me_pbuf ? 
MDBX_SUCCESS - : mdbx_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, + : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, &env->me_pbuf); } /* Further setup required for opening an MDBX environment */ -__cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, - const mdbx_mode_t mode_bits) { - MDBX_meta meta; +__cold static int setup_dxb(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { + MDBX_meta header; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + int err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) return err; - mdbx_debug("%s", "create new database"); + DEBUG("%s", "create new database"); rc = /* new database */ MDBX_RESULT_TRUE; if (!env->me_dbgeo.now) { @@ -15501,49 +15609,52 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) return err; - meta = *mdbx_init_metas(env, env->me_pbuf); - err = mdbx_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, + header = *init_metas(env, env->me_pbuf); + err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_lazy_fd, - env->me_dxb_mmap.filesize = env->me_dbgeo.now); + err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = + env->me_dxb_mmap.current = + env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; #ifndef NDEBUG /* just for checking */ - err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) return err; #endif } - mdbx_verbose( - "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO - "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta)); + VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN + ", %s", + header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); - if (env->me_psize != meta.mm_psize) - mdbx_setup_pagesize(env, meta.mm_psize); - const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + if (env->me_psize != header.mm_psize) + setup_pagesize(env, header.mm_psize); + const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, env->me_os_psize); if ((env->me_flags & MDBX_RDONLY) /* readonly */ || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ || /* recovery mode */ env->me_stuck_meta >= 0) { /* use present params from db */ - const size_t pagesize = meta.mm_psize; + const size_t pagesize = header.mm_psize; err = mdbx_env_set_geometry( - env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize, - meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize, - 
pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize); + env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize, + header.mm_geo.upper * pagesize, + pv2pages(header.mm_geo.grow_pv) * pagesize, + pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured geometry from db", - err); + ERROR("%s: err %d", "could not apply preconfigured geometry from db", + err); return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; } } else if (env->me_dbgeo.now) { @@ -15558,13 +15669,13 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, * - shrink threshold or growth step * But ignore change just a 'now/current' size. */ if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != - pgno2bytes(env, meta.mm_geo.upper) || + pgno2bytes(env, header.mm_geo.upper) || bytes_align2os_bytes(env, env->me_dbgeo.lower) != - pgno2bytes(env, meta.mm_geo.lower) || + pgno2bytes(env, header.mm_geo.lower) || bytes_align2os_bytes(env, env->me_dbgeo.shrink) != - pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)) || + pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) || bytes_align2os_bytes(env, env->me_dbgeo.grow) != - pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv))) { + pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) { if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) /* pre-shrink if enabled */ @@ -15573,84 +15684,82 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, env->me_dbgeo.upper, env->me_dbgeo.grow, - env->me_dbgeo.shrink, meta.mm_psize); + env->me_dbgeo.shrink, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured db-geometry", - err); + ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; } /* update meta fields */ - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); - meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); + header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); + header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); - mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), - pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), - mdbx_durable_str(&meta)); + VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), + durable_caption(&header)); } else { /* fetch back 'now/current' size, since it was ignored during comparison * and may differ. */ - env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); + env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now); } - mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); + ENSURE(env, header.mm_geo.now >= header.mm_geo.next); } else { /* geo-params are not pre-configured by user, * get current values from the meta. 
*/ - env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)); + env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now); + env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower); + env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)); } - mdbx_ensure(env, - pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); - mdbx_ensure(env, env->me_dbgeo.now >= used_bytes); + ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now); + ENSURE(env, env->me_dbgeo.now >= used_bytes); const uint64_t filesize_before = env->me_dxb_mmap.filesize; if (unlikely(filesize_before != env->me_dbgeo.now)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p), " - "assume other process working", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); } else { - mdbx_warning("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p)", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); if (filesize_before < used_bytes) { - mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO - ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); + ERROR("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { if (filesize_before & (env->me_os_psize - 1)) { - mdbx_error("%s", "filesize should be rounded-up to system page"); + ERROR("%s", "filesize should be rounded-up to system page"); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s", "ignore filesize mismatch in readonly-mode"); + WARNING("%s", "ignore filesize mismatch in readonly-mode"); } else { - mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO - " pages", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); + VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); } } } - mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", - bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, + bootid.y, (bootid.x | bootid.y) ? 
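/* A minimal sketch of pre-configuring the geometry that the code above
 * otherwise derives from the meta-page: an application may call
 * mdbx_env_set_geometry() between mdbx_env_create() and mdbx_env_open().
 * The concrete sizes below are arbitrary illustrative assumptions; -1 keeps
 * the implementation default for the page size.
 *
 *   MDBX_env *env = NULL;
 *   int rc = mdbx_env_create(&env);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_set_geometry(env,
 *                                1ul << 20,   // lower bound: 1 MiB
 *                                16ul << 20,  // initial size: 16 MiB
 *                                1ul << 30,   // upper bound: 1 GiB
 *                                1ul << 20,   // growth step: 1 MiB
 *                                16ul << 20,  // shrink threshold: 16 MiB
 *                                -1);         // keep default page size
 */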
"" : "not-"); #if MDBX_ENABLE_MADVISE /* calculate readahead hint before mmap with zero redundant pages */ @@ -15659,7 +15768,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; #endif /* MDBX_ENABLE_MADVISE */ - err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + err = osal_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -15673,7 +15782,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, return err; #endif /* MADV_DONTDUMP */ #if defined(MADV_DODUMP) - if (mdbx_runtime_flags & MDBX_DBG_DUMP) { + if (runtime_flags & MDBX_DBG_DUMP) { const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) ? ignore_enosys(errno) @@ -15689,8 +15798,8 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); #endif /* MDBX_USE_VALGRIND */ - mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && - used_bytes <= env->me_dxb_mmap.limit); + eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->me_dxb_mmap.limit); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { @@ -15705,198 +15814,211 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, : env->me_dxb_mmap.limit); #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + meta_troika_t troika = meta_tap(env); +#if MDBX_DEBUG + meta_troika_dump(env, &troika); +#endif + eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { /* recovery mode */ MDBX_meta clone; MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta); - err = mdbx_validate_meta_copy(env, target, &clone); + err = validate_meta_copy(env, target, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("target meta[%u] is corrupted", - bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + ERROR("target meta[%u] is corrupted", + bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } } else /* not recovery mode */ while (1) { - const unsigned meta_clash_mask = meta_eq_mask(env); + const unsigned meta_clash_mask = meta_eq_mask(&troika); if (unlikely(meta_clash_mask)) { - mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); + ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { /* non-exclusive mode, * meta-pages should be validated by a first process opened the DB */ - volatile const MDBX_meta *const head = meta_prefer_last(env); - volatile const MDBX_meta *const steady = meta_prefer_steady(env); - const txnid_t head_txnid = meta_txnid(env, head); - const txnid_t steady_txnid = meta_txnid(env, steady); - if (head_txnid == steady_txnid) + if (troika.recent == troika.prefer_steady) break; if (!env->me_lck_mmap.lck) { /* LY: without-lck (read-only) mode, so it is impossible that other * process made weak checkpoint. 
*/ - mdbx_error("%s", "without-lck, unable recovery/rollback"); + ERROR("%s", "without-lck, unable recovery/rollback"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ - mdbx_verbose("%s", "assume collision or online weak checkpoint"); + VERBOSE("%s", "assume collision or online weak checkpoint"); break; } - mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE); + eASSERT(env, lck_rc == MDBX_RESULT_TRUE); /* exclusive mode */ + const meta_ptr_t recent = meta_recent(env, &troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika); MDBX_meta clone; - const MDBX_meta *const steady = constmeta_prefer_steady(env); - const MDBX_meta *const head = constmeta_prefer_last(env); - const txnid_t steady_txnid = meta_txnid(env, steady); - if (META_IS_STEADY(steady)) { - err = mdbx_validate_meta_copy(env, steady, &clone); + if (prefer_steady.is_steady) { + err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("meta[%u] with %s txnid %" PRIaTXN - " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady", - steady_txnid, "manual recovery"); + ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", + bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map), + "steady", prefer_steady.txnid, "manual recovery"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } - if (steady == head) + if (prefer_steady.ptr_c == recent.ptr_c) break; } - const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map); - const txnid_t head_txnid = meta_txnid(env, head); - const bool head_valid = - mdbx_validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; - mdbx_assert(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid); - if (unlikely(!head_valid)) { - if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s for open or automatic rollback, %s", - "there are no suitable meta-pages", - "manual recovery is required"); + const pgno_t pgno = + bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map); + const bool last_valid = + validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; + eASSERT(env, + !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid); + if (unlikely(!last_valid)) { + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s for open or automatic rollback, %s", + "there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } - mdbx_warning("meta[%u] with last txnid %" PRIaTXN - " is corrupted, rollback needed", - pgno, head_txnid); + WARNING("meta[%u] with last txnid %" PRIaTXN + " is corrupted, rollback needed", + pgno, recent.txnid); + meta_troika_dump(env, &troika); goto purge_meta_head; } - if (meta_bootid_match(head)) { + if (meta_bootid_match(recent.ptr_c)) { if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ", but unable in read-only mode"); + ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, + ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync 
NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ""); - meta = clone; - atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next, + WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, ""); + header = clone; + atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next, mo_Relaxed); break; } - if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s, but %s for automatic rollback: %s", - "opening after an unclean shutdown", - "there are no suitable meta-pages", - "manual recovery is required"); + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s, but %s for automatic rollback: %s", + "opening after an unclean shutdown", + "there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s and rollback needed: (from head %" PRIaTXN - " to steady %" PRIaTXN ")%s", - "opening after an unclean shutdown", head_txnid, - steady_txnid, ", but unable in read-only mode"); + ERROR("%s and rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN ")%s", + "opening after an unclean shutdown", recent.txnid, + prefer_steady.txnid, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } purge_meta_head: - mdbx_notice("%s and doing automatic rollback: " - "purge%s meta[%u] with%s txnid %" PRIaTXN, - "opening after an unclean shutdown", - head_valid ? "" : " invalid", pgno, head_valid ? " weak" : "", - head_txnid); - mdbx_ensure(env, META_IS_STEADY(steady)); - err = mdbx_override_meta(env, pgno, 0, head_valid ? head : steady); + NOTICE("%s and doing automatic rollback: " + "purge%s meta[%u] with%s txnid %" PRIaTXN, + "opening after an unclean shutdown", last_valid ? "" : " invalid", + pgno, last_valid ? " weak" : "", recent.txnid); + meta_troika_dump(env, &troika); + ENSURE(env, prefer_steady.is_steady); + err = override_meta(env, pgno, 0, + last_valid ? 
recent.ptr_c : prefer_steady.ptr_c); if (err) { - mdbx_error("rollback: overwrite meta[%u] with txnid %" PRIaTXN - ", error %d", - pgno, head_txnid, err); + ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", + pgno, recent.txnid, err); return err; } - mdbx_ensure(env, 0 == meta_txnid(env, head)); - mdbx_ensure(env, 0 == meta_eq_mask(env)); + troika = meta_tap(env); + ENSURE(env, 0 == meta_txnid(recent.ptr_v)); + ENSURE(env, 0 == meta_eq_mask(&troika)); } if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { //-------------------------------------------------- shrink DB & update geo - const MDBX_meta *head = constmeta_prefer_last(env); /* re-check size after mmap */ if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || env->me_dxb_mmap.current < used_bytes) { - mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR, - env->me_dxb_mmap.current); + ERROR("unacceptable/unexpected datafile size %" PRIuPTR, + env->me_dxb_mmap.current); return MDBX_PROBLEM; } if (env->me_dxb_mmap.current != env->me_dbgeo.now) { - meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); - mdbx_notice("need update meta-geo to filesize %" PRIuPTR - " bytes, %" PRIaPGNO " pages", - env->me_dxb_mmap.current, meta.mm_geo.now); + header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); + NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dxb_mmap.current, header.mm_geo.now); } - if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + const meta_ptr_t recent = meta_recent(env, &troika); + if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, sizeof(header.mm_geo))) { if ((env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) { - mdbx_warning( - "skipped update meta.geo in %s mode: from l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", - (env->me_stuck_meta < 0) ? "read-only" : "recovery", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), - meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); + WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", + (env->me_stuck_meta < 0) ? 
"read-only" : "recovery", + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower, + header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv)); } else { - const txnid_t txnid = constmeta_txnid(env, head); - const txnid_t next_txnid = safe64_txnid_next(txnid); - if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + const txnid_t next_txnid = safe64_txnid_next(recent.txnid); + if (unlikely(next_txnid > MAX_TXNID)) { + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - mdbx_notice("updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + NOTICE("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv), next_txnid); - mdbx_ensure(env, meta_eq(env, &meta, head)); - meta_set_txnid(env, &meta, next_txnid); - err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); + ENSURE(env, header.unsafe_txnid == recent.txnid); + meta_set_txnid(env, &header, next_txnid); + err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header, + &troika); if (err) { - mdbx_error("error %d, while updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - err, head->mm_geo.lower, head->mm_geo.now, - head->mm_geo.upper, pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + ERROR("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid); return err; } } @@ -15906,27 +16028,28 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { - MDBX_meta *const pmeta = 
METAPAGE(env, n); - if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) != + MDBX_meta *const meta = METAPAGE(env, n); + if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != MDBX_DATA_MAGIC)) { - const txnid_t txnid = meta_txnid(env, pmeta); - mdbx_notice("%s %s" - "meta[%u], txnid %" PRIaTXN, - "updating db-format signature for", - META_IS_STEADY(pmeta) ? "stead-" : "weak-", n, txnid); - err = mdbx_override_meta(env, n, txnid, pmeta); + const txnid_t txnid = constmeta_txnid(meta); + NOTICE("%s %s" + "meta[%u], txnid %" PRIaTXN, + "updating db-format signature for", + META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid); + err = override_meta(env, n, txnid, meta); if (unlikely(err != MDBX_SUCCESS) && /* Just ignore the MDBX_PROBLEM error, since here it is * returned only in case of the attempt to upgrade an obsolete * meta-page that is invalid for current state of a DB, * e.g. after shrinking DB file */ err != MDBX_PROBLEM) { - mdbx_error("%s meta[%u], txnid %" PRIaTXN ", error %d", - "updating db-format signature for", n, txnid, err); + ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", + "updating db-format signature for", n, txnid, err); return err; } + troika = meta_tap(env); } } } @@ -15938,9 +16061,9 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #if defined(MADV_REMOVE) if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && /* not recovery mode */ env->me_stuck_meta < 0) { - mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) @@ -15951,9 +16074,9 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, } #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "DONTNEED", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) @@ -15976,7 +16099,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MADV_DONTNEED */ } - err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); if (unlikely(err != MDBX_SUCCESS)) return err; #endif /* MDBX_ENABLE_MADVISE */ @@ -15987,12 +16110,12 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ /* Open and/or initialize the lock region for the environment. 
*/ -__cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, - mdbx_mode_t mode) { - mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); +__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, + mdbx_mode_t mode) { + eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); - int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); + int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); if (err != MDBX_SUCCESS) { switch (err) { default: @@ -16010,8 +16133,8 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } if (err != MDBX_ENOFILE) { - /* ensure the file system is read-only */ - err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); if (err != MDBX_SUCCESS && /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) @@ -16021,12 +16144,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ /* beginning of a locked section ---------------------------------------- */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); env->me_lfd = INVALID_HANDLE_VALUE; - const int rc = mdbx_lck_seize(env); + const int rc = osal_lck_seize(env); if (MDBX_IS_ERROR(rc)) { /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). */ lcklist_unlock(); return rc; } @@ -16038,23 +16161,23 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_lck = lckless_stub(env); env->me_maxreaders = UINT_MAX; - mdbx_debug("lck-setup:%s%s%s", " lck-less", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + DEBUG("lck-setup:%s%s%s", " lck-less", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); return rc; } /* beginning of a locked section ------------------------------------------ */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - err = mdbx_lck_seize(env); + err = osal_lck_seize(env); if (MDBX_IS_ERROR(err)) { bailout: /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). */ lcklist_unlock(); return err; } @@ -16065,7 +16188,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (MDBX_IS_ERROR(err)) goto bailout; if (inprocess_neighbor && - ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { err = MDBX_BUSY; goto bailout; @@ -16073,13 +16196,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } const int lck_seize_rc = err; - mdbx_debug("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" - : " cooperative"); + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); uint64_t size = 0; - err = mdbx_filesize(env->me_lfd, &size); + err = osal_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) goto bailout; @@ -16087,7 +16209,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); - mdbx_jitter4testing(false); + jitter4testing(false); } else { if (env->me_flags & MDBX_EXCLUSIVE) { err = MDBX_BUSY; @@ -16095,7 +16217,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { - mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size); + ERROR("lck-file has invalid size %" PRIu64 " bytes", size); err = MDBX_PROBLEM; goto bailout; } @@ -16104,7 +16226,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, const size_t maxreaders = ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); if (maxreaders < 4) { - mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); + ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); err = MDBX_PROBLEM; goto bailout; } @@ -16112,7 +16234,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT; - err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, + err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size, lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE); @@ -16145,55 +16267,54 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (lck_seize_rc == MDBX_RESULT_TRUE) { /* LY: exclusive mode, check and reset lck content */ memset(lck, 0, (size_t)size); - mdbx_jitter4testing(false); + jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; lck->mti_os_and_format = MDBX_LOCK_FORMAT; #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "msync"); + ERROR("initial-%s for lck-file failed", "msync"); goto bailout; } - err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); + err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "fsync"); + ERROR("initial-%s for lck-file failed", "fsync"); goto bailout; } } else { if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC; - mdbx_error( - "lock region has %s", - invalid - ? "invalid magic" - : "incompatible version (only applications with nearly or the " - "same versions of libmdbx can share the same database)"); + ERROR("lock region has %s", + invalid + ? "invalid magic" + : "incompatible version (only applications with nearly or the " + "same versions of libmdbx can share the same database)"); err = invalid ? 
MDBX_INVALID : MDBX_VERSION_MISMATCH; goto bailout; } if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { - mdbx_error("lock region has os/format signature 0x%" PRIx32 - ", expected 0x%" PRIx32, - lck->mti_os_and_format, MDBX_LOCK_FORMAT); + ERROR("lock region has os/format signature 0x%" PRIx32 + ", expected 0x%" PRIx32, + lck->mti_os_and_format, MDBX_LOCK_FORMAT); err = MDBX_VERSION_MISMATCH; goto bailout; } } - err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc); + err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); if (MDBX_IS_ERROR(err)) goto bailout; - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); /* insert into inprocess lck-list */ env->me_lcklist_next = inprocess_lcklist_head; inprocess_lcklist_head = env; lcklist_unlock(); /* end of a locked section ------------------------------------------------ */ - mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc)); + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); env->me_lck = lck; return lck_seize_rc; } @@ -16251,24 +16372,26 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { return r; } -__cold static int __must_check_result mdbx_override_meta( - MDBX_env *env, unsigned target, txnid_t txnid, const MDBX_meta *shape) { +__cold static int __must_check_result override_meta(MDBX_env *env, + unsigned target, + txnid_t txnid, + const MDBX_meta *shape) { int rc = alloc_page_buf(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *const page = env->me_pbuf; - mdbx_meta_model(env, page, target); + meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); - mdbx_assert(env, meta_checktxnid(env, model, true)); + eASSERT(env, coherency_check_meta(env, model, true)); if (shape) { - if (txnid && unlikely(!meta_checktxnid(env, shape, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "pre", constmeta_txnid(env, shape)); + if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { + ERROR("bailout overriding meta-%u since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; } - if (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) + if (runtime_flags & MDBX_DBG_DONT_UPGRADE) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); model->mm_extra_flags = shape->mm_extra_flags; @@ -16286,16 +16409,16 @@ __cold static int __must_check_result mdbx_override_meta( model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); - if (unlikely(!meta_checktxnid(env, model, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "post", txnid); + if (unlikely(!coherency_check_meta(env, model, false))) { + ERROR("bailout overriding meta-%u since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "post", txnid); return MDBX_PROBLEM; } } } - unaligned_poke_u64(4, model->mm_datasync_sign, meta_sign(model)); - rc = mdbx_validate_meta(env, model, page, target, nullptr); + unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); + rc = validate_meta(env, model, page, target, nullptr); if (unlikely(MDBX_IS_ERROR(rc))) return MDBX_PROBLEM; @@ -16306,28 +16429,29 @@ __cold static int __must_check_result mdbx_override_meta( 
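/* The lck-file handling in setup_lck() above sizes the reader table from
 * me_maxreaders when the file is created exclusively, and derives
 * me_maxreaders from the existing file otherwise. A minimal sketch of tuning
 * that limit from application code before the environment is opened; the
 * value 126, the pathname and the file mode are illustrative assumptions.
 *
 *   MDBX_env *env = NULL;
 *   int rc = mdbx_env_create(&env);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_set_maxreaders(env, 126);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_open(env, "./testdb", MDBX_ENV_DEFAULTS, 0644);
 */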
env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { - rc = mdbx_msync(&env->me_dxb_mmap, 0, + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, model->mm_geo.next), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* mdbx_override_meta() called only while current process have exclusive + /* override_meta() called only while current process have exclusive * lock of a DB file. So meta-page could be updated directly without * clearing consistency flag by mdbx_meta_update_begin() */ memcpy(pgno2page(env, target), page, env->me_psize); - mdbx_flush_incoherent_cpu_writeback(); - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), + osal_flush_incoherent_cpu_writeback(); + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) ? env->me_dsync_fd : env->me_lazy_fd; - rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); + rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); + eASSERT(env, !env->me_txn && !env->me_txn0); return rc; } @@ -16343,32 +16467,42 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { return MDBX_EPERM; const MDBX_meta *target_meta = METAPAGE(env, target); - txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(env, target_meta)); + txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta)); for (unsigned n = 0; n < NUM_METAS; ++n) { - MDBX_page *page = pgno2page(env, n); - MDBX_meta meta = *page_meta(page); if (n == target) continue; - if (mdbx_validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { - int err = mdbx_override_meta(env, n, 0, nullptr); + MDBX_meta meta = *METAPAGE(env, target); + if (validate_meta(env, &meta, pgno2page(env, n), n, nullptr) != + MDBX_SUCCESS) { + int err = override_meta(env, n, 0, nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; } else { - txnid_t txnid = constmeta_txnid(env, &meta); + txnid_t txnid = constmeta_txnid(&meta); if (new_txnid <= txnid) new_txnid = safe64_txnid_next(txnid); } } if (unlikely(new_txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - return mdbx_override_meta(env, target, new_txnid, target_meta); + return override_meta(env, target, new_txnid, target_meta); } __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *pathnameW = nullptr; + OSAL_MB2WIDE(pathname, pathnameW); + return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); +} + +__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, + unsigned target_meta, bool writeable) { +#endif /* Windows */ + if (unlikely(target_meta >= NUM_METAS)) return MDBX_EINVAL; int rc = check_env(env, false); @@ -16378,35 +16512,49 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, return MDBX_EPERM; env->me_stuck_meta = 
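/* A minimal sketch of driving the recovery entry points introduced just
 * above (mdbx_env_open_for_recovery() followed by
 * mdbx_env_turn_for_recovery()); the pathname and the chosen meta-page
 * index 1 are illustrative assumptions, e.g. an index previously suggested
 * by inspecting the database with mdbx_chk.
 *
 *   MDBX_env *env = NULL;
 *   int rc = mdbx_env_create(&env);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_open_for_recovery(env, "./testdb", 1, true);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_turn_for_recovery(env, 1);  // promote meta[1] to head
 *   mdbx_env_close(env);
 */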
(int8_t)target_meta; - return mdbx_env_open( - env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, - 0); + return +#if defined(_WIN32) || defined(_WIN64) + mdbx_env_openW +#else + mdbx_env_open +#endif /* Windows */ + (env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, + 0); } typedef struct { void *buffer_for_free; - char *lck, *dxb; + pathchar_t *lck, *dxb; size_t ent_len; } MDBX_handle_env_pathname; -__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, - const char *pathname, - MDBX_env_flags_t *flags, - const mdbx_mode_t mode) { - int rc; +static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) { +#if defined(_WIN32) || defined(_WIN64) + while (len > 0) { + pathchar_t a = *l++; + pathchar_t b = *r++; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? '/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + +__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, + const pathchar_t *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { memset(ctx, 0, sizeof(*ctx)); - if (unlikely(!pathname)) + if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; + int rc; #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - - const DWORD dwAttrib = GetFileAttributesW(pathnameW); + const DWORD dwAttrib = GetFileAttributesW(pathname); if (dwAttrib == INVALID_FILE_ATTRIBUTES) { rc = GetLastError(); if (rc != MDBX_ENOFILE) @@ -16416,8 +16564,7 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, return rc; /* auto-create directory if requested */ - if ((*flags & MDBX_NOSUBDIR) == 0 && - !CreateDirectoryW(pathnameW, nullptr)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; @@ -16458,41 +16605,61 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, } #endif - static const char dxb_name[] = MDBX_DATANAME; - static const size_t dxb_name_len = sizeof(dxb_name) - 1; - static const char lck_name[] = MDBX_LOCKNAME; - static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + static const pathchar_t dxb_name[] = MDBX_DATANAME; + static const pathchar_t lck_name[] = MDBX_LOCKNAME; + static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; - ctx->ent_len = strlen(pathname); - if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && - !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { +#if defined(_WIN32) || defined(_WIN64) + assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); + const size_t pathname_len = wcslen(pathname); +#else + assert(dxb_name[0] == '/' && lck_name[0] == '/'); + const size_t pathname_len = strlen(pathname); +#endif + assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/'); + ctx->ent_len = pathname_len; + static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; + if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len && + path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + dxb_name_len)) { *flags -= MDBX_NOSUBDIR; ctx->ent_len -= dxb_name_len; } const size_t bytes_needed = - ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) - ? 
sizeof(lock_suffix) + 1 - : sizeof(lck_name) + sizeof(dxb_name)); - ctx->buffer_for_free = mdbx_malloc(bytes_needed); + sizeof(pathchar_t) * ctx->ent_len * 2 + + ((*flags & MDBX_NOSUBDIR) ? sizeof(lock_suffix) + sizeof(pathchar_t) + : sizeof(lck_name) + sizeof(dxb_name)); + ctx->buffer_for_free = osal_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; - ctx->lck = ctx->buffer_for_free; + ctx->dxb = ctx->buffer_for_free; + ctx->lck = ctx->dxb + ctx->ent_len + 1; + memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); if (*flags & MDBX_NOSUBDIR) { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); - sprintf(ctx->lck, "%s%s", pathname, lock_suffix); - strcpy(ctx->dxb, pathname); + memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); } else { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); - sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); - sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); + ctx->lck += dxb_name_len; + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); } + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); return MDBX_SUCCESS; } __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *pathnameW = nullptr; + OSAL_MB2WIDE(pathname, pathnameW); + return mdbx_env_deleteW(pathnameW, mode); +} + +__cold int mdbx_env_deleteW(const wchar_t *pathname, + MDBX_env_delete_mode_t mode) { +#endif /* Windows */ + switch (mode) { default: return MDBX_EINVAL; @@ -16510,35 +16677,35 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { memset(dummy_env, 0, sizeof(*dummy_env)); dummy_env->me_flags = (mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; - dummy_env->me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env->me_os_psize = (unsigned)osal_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); - dummy_env->me_pathname = (char *)pathname; + dummy_env->me_pathname = (pathchar_t *)pathname; MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); int rc = MDBX_RESULT_TRUE, - err = mdbx_handle_env_pathname( - &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + err = handle_env_pathname(&env_pathname, pathname, + (MDBX_env_flags_t *)&dummy_env->me_flags, 0); if (likely(err == MDBX_SUCCESS)) { mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, dxb_handle = INVALID_HANDLE_VALUE; if (mode > MDBX_ENV_JUST_DELETE) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, &dxb_handle, 0); err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; if (err == MDBX_SUCCESS) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, &clk_handle, 0); err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; } if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.dxb); + err = osal_removefile(env_pathname.dxb); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -16546,7 +16713,7 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.lck); + err = osal_removefile(env_pathname.lck); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -16554,7 +16721,7 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { - err = mdbx_removedirectory(pathname); + err = osal_removedirectory(pathname); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -16562,18 +16729,28 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (dxb_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(dxb_handle); + osal_closefile(dxb_handle); if (clk_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(clk_handle); + osal_closefile(clk_handle); } else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return (err == MDBX_SUCCESS) ? rc : err; } __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *pathnameW = nullptr; + OSAL_MB2WIDE(pathname, pathnameW); + return mdbx_env_openW(env, pathnameW, flags, mode); +} + +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + int rc = check_env(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -16581,29 +16758,21 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (unlikely(flags & ~ENV_USABLE_FLAGS)) return MDBX_EINVAL; - if (flags & MDBX_RDONLY) - mode = 0; - - if (env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map) + if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) return MDBX_EPERM; - /* pickup previously mdbx_env_set_flags(), + /* Pickup previously mdbx_env_set_flags(), * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags, env->me_flags); - - MDBX_handle_env_pathname env_pathname; - rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); if (flags & MDBX_RDONLY) { - /* LY: silently ignore irrelevant flags when - * we're only getting read access */ + /* Silently ignore irrelevant flags when we're only getting read access */ flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 
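/* A minimal sketch of the removal helper implemented above; the pathname is
 * an illustrative assumption. Per the code above, MDBX_RESULT_TRUE means
 * there was nothing to delete, while a locking failure indicates the
 * environment is still in use by another process.
 *
 *   int rc = mdbx_env_delete("./testdb", MDBX_ENV_ENSURE_UNUSED);
 *   if (rc == MDBX_SUCCESS) {
 *     // data file, lock file and, for a directory path, the directory
 *     // itself were removed
 *   } else if (rc == MDBX_RESULT_TRUE) {
 *     // nothing existed at that path
 *   }
 */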
0; } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. @@ -16612,42 +16781,47 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; else { - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); - rc = 42 /* ENOPROTOOPT */; - goto bailout; + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + return 42 /* ENOPROTOOPT */; } } #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ } + MDBX_handle_env_pathname env_pathname; + rc = handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); - env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); - env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); - env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); + env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); + env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); + env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); + env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && env->me_dbiseqs)) { rc = MDBX_ENOMEM; goto bailout; } - memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); + memcpy(env->me_pathname, env_pathname.dxb, + env_pathname.ent_len * sizeof(pathchar_t)); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, + rc = osal_openfile((flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; - mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); - mdbx_ensure(env, (rc != MDBX_SUCCESS) == - (env->me_dsync_fd == INVALID_HANDLE_VALUE)); + ENSURE(env, + (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); } #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -16674,7 +16848,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); + const int lck_rc = setup_lck(env, env_pathname.lck, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; @@ -16682,16 +16856,16 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, /* Set the position in files outside of the data to avoid corruption * due to erroneous use of file descriptors in the application code. 
*/ - mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); - mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); + osal_fseek(env->me_lfd, UINT64_C(1) << 63); + osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63); if (env->me_dsync_fd != INVALID_HANDLE_VALUE) - mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63); const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | - MDBX_LIFORECLAIM | MDBX_COALESCE | - MDBX_NORDAHEAD; + MDBX_LIFORECLAIM | + MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { @@ -16702,11 +16876,11 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, * - let's assume that for some reason the DB file is smaller * than it should be according to the geometry, * but not smaller than the last page used; - * - the first process that opens the database (lc_rc = true) + * - the first process that opens the database (lck_rc == RESULT_TRUE) * does this in readonly mode and therefore cannot bring * the file size back to normal; - * - some next process (lc_rc = false) opens the DB in read-write - * mode and now is here. + * - some next process (lck_rc != RESULT_TRUE) opens the DB in + * read-write mode and now is here. * * FIXME: Should we re-check and set the size of DB-file right here? */ break; @@ -16715,23 +16889,22 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } if (env->me_flags & MDBX_ACCEDE) { - /* pickup current mode-flags, including MDBX_LIFORECLAIM | - * MDBX_COALESCE | MDBX_NORDAHEAD */ + /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ const unsigned diff = (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; - mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, - env->me_flags ^ diff); + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, + env->me_flags ^ diff); env->me_flags ^= diff; } if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { - mdbx_error("%s", "current mode/flags incompatible with requested"); + ERROR("%s", "current mode/flags incompatible with requested"); rc = MDBX_INCOMPATIBLE; goto bailout; } } - const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode); + const int dxb_rc = setup_dxb(env, lck_rc, mode); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; @@ -16740,32 +16913,31 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (flags & MDBX_EXCLUSIVE) == 0)) { - mdbx_error("%s", "recovery requires exclusive mode"); + ERROR("%s", "recovery requires exclusive mode"); rc = MDBX_BUSY; goto bailout; } - mdbx_debug("opened dbenv %p", (void *)env); + DEBUG("opened dbenv %p", (void *)env); if (lck) { if (lck_rc == MDBX_RESULT_TRUE) { lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - lck->mti_reader_check_timestamp.weak = mdbx_osal_monotime(); - rc = mdbx_lck_downgrade(env); - mdbx_debug("lck-downgrade-%s: rc %i", - (env->me_flags & MDBX_EXCLUSIVE) ? 
"partial" : "full", rc); + lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); + lck->mti_reader_check_timestamp.weak = osal_monotime(); + rc = osal_lck_downgrade(env); + DEBUG("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) goto bailout; } else { - rc = mdbx_cleanup_dead_readers(env, false, NULL); + rc = cleanup_dead_readers(env, false, NULL); if (MDBX_IS_ERROR(rc)) goto bailout; } if ((env->me_flags & MDBX_NOTLS) == 0) { - rc = mdbx_rthc_alloc(&env->me_txkey, &lck->mti_readers[0], - &lck->mti_readers[env->me_maxreaders]); + rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], + &lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_flags |= MDBX_ENV_TXKEY; @@ -16773,30 +16945,31 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } else { env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = mdbx_osal_monotime(); + (uint32_t)recent_committed_txnid(env); + env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); } if ((flags & MDBX_RDONLY) == 0) { const size_t tsize = sizeof(MDBX_txn), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(unsigned) + 1); + sizeof(MDBX_atomic_uint32_t) + 1); rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { memset(env->me_pbuf, -1, env->me_psize * 2); - MDBX_txn *txn = mdbx_calloc(1, size); + MDBX_txn *txn = osal_calloc(1, size); if (txn) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbiseqs = + (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; - txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL); if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) rc = MDBX_ENOMEM; } else @@ -16806,51 +16979,52 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - const MDBX_meta *meta = constmeta_prefer_last(env); - const MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; + const meta_troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; - mdbx_debug("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), - env->me_psize); - mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(meta)->mp_pgno, meta_txnid(env, meta)); - mdbx_debug("depth: %u", db->md_depth); - mdbx_debug("entries: %" PRIu64, db->md_entries); - mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); - mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages); - mdbx_debug("root: %" PRIaPGNO, db->md_root); - mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), + env->me_psize); + 
DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(head.ptr_c)->mp_pgno, head.txnid); + DEBUG("depth: %u", db->md_depth); + DEBUG("entries: %" PRIu64, db->md_entries); + DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); + DEBUG("root: %" PRIaPGNO, db->md_root); + DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); } #endif bailout: if (rc != MDBX_SUCCESS) { - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; + rc = env_close(env) ? MDBX_PANIC : rc; env->me_flags = saved_me_flags | ((rc != MDBX_PANIC) ? 0 : MDBX_FATAL_ERROR); } else { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif } - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int mdbx_env_close0(MDBX_env *env) { +__cold static int env_close(MDBX_env *env) { const unsigned flags = env->me_flags; if (!(flags & MDBX_ENV_ACTIVE)) { - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); return MDBX_SUCCESS; } env->me_flags &= ~ENV_INTERNAL_FLAGS; env->me_lck = nullptr; if (flags & MDBX_ENV_TXKEY) { - mdbx_rthc_remove(env->me_txkey); - env->me_txkey = (mdbx_thread_key_t)0; + rthc_remove(env->me_txkey); + env->me_txkey = (osal_thread_key_t)0; } lcklist_lock(); @@ -16858,7 +17032,7 @@ __cold static int mdbx_env_close0(MDBX_env *env) { lcklist_unlock(); if (env->me_map) { - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); #ifdef MDBX_USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -16866,52 +17040,52 @@ __cold static int mdbx_env_close0(MDBX_env *env) { } if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_dsync_fd); + (void)osal_closefile(env->me_dsync_fd); env->me_dsync_fd = INVALID_HANDLE_VALUE; } if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lazy_fd); + (void)osal_closefile(env->me_lazy_fd); env->me_lazy_fd = INVALID_HANDLE_VALUE; } if (env->me_lck_mmap.lck) - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lfd); + (void)osal_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; } if (env->me_dbxs) { for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) - mdbx_free(env->me_dbxs[i].md_name.iov_base); - mdbx_free(env->me_dbxs); + osal_free(env->me_dbxs[i].md_name.iov_base); + osal_free(env->me_dbxs); env->me_dbxs = nullptr; } if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } if (env->me_dbiseqs) { - mdbx_free(env->me_dbiseqs); + osal_free(env->me_dbiseqs); env->me_dbiseqs = nullptr; } if (env->me_dbflags) { - mdbx_free(env->me_dbflags); + osal_free(env->me_dbflags); env->me_dbflags = nullptr; } if (env->me_pathname) { - mdbx_free(env->me_pathname); + osal_free(env->me_pathname); env->me_pathname = nullptr; } if (env->me_txn0) { - mdbx_dpl_free(env->me_txn0); - mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); - mdbx_pnl_free(env->me_txn0->tw.retired_pages); - mdbx_pnl_free(env->me_txn0->tw.spill_pages); - mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist); - mdbx_free(env->me_txn0); + dpl_free(env->me_txn0); + txl_free(env->me_txn0->tw.lifo_reclaimed); + 
pnl_free(env->me_txn0->tw.retired_pages); + pnl_free(env->me_txn0->tw.spill_pages); + pnl_free(env->me_txn0->tw.reclaimed_pglist); + osal_free(env->me_txn0); env->me_txn0 = nullptr; } env->me_stuck_meta = -1; @@ -16933,13 +17107,13 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * platforms (i.e. where fork() is available). * This is required to legitimize a call after fork() * from a child process, that should be allowed to free resources. */ - if (unlikely(env->me_pid != mdbx_getpid())) + if (unlikely(env->me_pid != osal_getpid())) env->me_flags |= MDBX_FATAL_ERROR; #endif /* MDBX_ENV_CHECKPID */ if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && env->me_txn0) { - if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) + if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self()) return MDBX_BUSY; } else dont_sync = true; @@ -16953,14 +17127,14 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * process is running a writing transaction or not. * Because in the "owner died" condition kernel don't release * file lock immediately. */ - rc = mdbx_env_sync_internal(env, true, false); + rc = env_sync(env, true, false); rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; #else struct stat st; if (unlikely(fstat(env->me_lazy_fd, &st))) rc = errno; else if (st.st_nlink > 0 /* don't sync deleted files */) { - rc = mdbx_env_sync_internal(env, true, true); + rc = env_sync(env, true, true); rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS @@ -16969,32 +17143,31 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif } - mdbx_assert(env, env->me_signature.weak == 0); - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; - mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); + eASSERT(env, env->me_signature.weak == 0); + rc = env_close(env) ? MDBX_PANIC : rc; + ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ DeleteCriticalSection(&env->me_windowsbug_lock); #else - mdbx_ensure(env, - mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - mdbx_ensure(env, mdbx_ipclock_destroy(&stub->mti_wlock) == 0); + ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0); #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dp_reserve = dp->mp_next; - mdbx_free(dp); + osal_free(dp); } VALGRIND_DESTROY_MEMPOOL(env); - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); env->me_pid = 0; - mdbx_free(env); + osal_free(env); return rc; } @@ -17006,8 +17179,8 @@ __cold int mdbx_env_close(MDBX_env *env) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Compare two items pointing at aligned unsigned int's. 
*/ -static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); +__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(4, a->iov_base), @@ -17023,8 +17196,8 @@ static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items pointing at 2-byte aligned unsigned int's. */ -static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); +__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(2, a->iov_base), @@ -17042,8 +17215,8 @@ static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { /* Compare two items pointing at unsigned values with unknown alignment. * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ -static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); +__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(1, a->iov_base), @@ -17059,7 +17232,7 @@ static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items lexically */ -static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { if (a->iov_len == b->iov_len) return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; @@ -17070,7 +17243,7 @@ static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items in reverse byte order */ -static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; if (likely(shortest)) { const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; @@ -17086,9 +17259,9 @@ static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { } /* Fast non-lexically comparator */ -static int __hot cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { int diff = CMP2INT(a->iov_len, b->iov_len); - return likely(diff || a->iov_len == 0) + return likely(diff) || a->iov_len == 0 ? diff : memcmp(a->iov_base, b->iov_base, a->iov_len); } @@ -17105,15 +17278,15 @@ static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, * Returns the smallest entry larger or equal to the key. * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. */ -static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, - const MDBX_val *key) { +__hot static struct node_result node_search(MDBX_cursor *mc, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; const int nkeys = page_numkeys(mp); DKBUF_DEBUG; - mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys, - IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno); + DEBUG("searching %u keys in %s %spage %" PRIaPGNO, nkeys, + IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? 
"sub-" : "", + mp->mp_pgno); struct node_result ret; ret.exact = false; @@ -17126,32 +17299,31 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, return ret; } - int cr = 0, i = 0; + int i; MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; MDBX_val nodekey; if (unlikely(IS_LEAF2(mp))) { - mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); + cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); nodekey.iov_len = mp->mp_leaf2_ksize; do { i = (low + high) >> 1; nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); - cr = cmp(key, &nodekey); - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); - if (unlikely(cr == 0)) { + cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); + int cr = cmp(key, &nodekey); + DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -17168,32 +17340,29 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, MDBX_node *node; do { i = (low + high) >> 1; - node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); - - cr = cmp(key, &nodekey); + cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); + int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); + DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); else - mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, - DKEY_DEBUG(&nodekey), node_pgno(node), cr); - if (unlikely(cr == 0)) { + DEBUG("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, + DKEY_DEBUG(&nodekey), node_pgno(node), cr); + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -17203,11 +17372,11 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, } /* Pop a page off the top of the cursor's stack. 
*/ -static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { - if (mc->mc_snum) { - mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (--mc->mc_snum) { +static __inline void cursor_pop(MDBX_cursor *mc) { + if (likely(mc->mc_snum)) { + DEBUG("popped page %" PRIaPGNO " off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); + if (likely(--mc->mc_snum)) { mc->mc_top--; } else { mc->mc_flags &= ~C_INITIALIZED; @@ -17217,42 +17386,109 @@ static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDBX_TXN_ERROR on failure. */ -static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { - mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *)mc); +static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) { + DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), + (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_CURSOR_FULL; } - mdbx_cassert(mc, mc->mc_snum < UINT16_MAX); mc->mc_top = mc->mc_snum++; mc->mc_pg[mc->mc_top] = mp; mc->mc_ki[mc->mc_top] = 0; - return MDBX_SUCCESS; } -__hot static struct page_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, - /* TODO: use parent-page ptr */ txnid_t front) { - struct page_result ret; - MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, front <= txn->mt_front); - if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); - notfound: - ret.page = nullptr; - ret.err = MDBX_PAGE_NOTFOUND; - bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return ret; +__hot static __always_inline int page_get_checker_lite(const uint16_t ILL, + const MDBX_page *page, + MDBX_txn *const txn, + const txnid_t front) { + if (unlikely(page->mp_flags & ILL)) { + if (ILL == P_ILL_BITS || (page->mp_flags & P_ILL_BITS)) + return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags); + else if (ILL & P_OVERFLOW) { + assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "large/overlow", "branch/leaf/leaf2", page->mp_flags); + } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { + assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "branch/leaf/leaf2", "large/overlow", page->mp_flags); + } else { + assert(false); + } } - MDBX_env *const env = txn->mt_env; - mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) { + if (unlikely(page->mp_txnid > front) && + unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) + return bad_page( + page, + "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + page->mp_txnid, + (front == txn->mt_front && front != txn->mt_txnid) ? 
"front-txn" + : "parent-page", + front); + + if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && + (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { + if (unlikely(page->mp_upper < page->mp_lower || + ((page->mp_lower | page->mp_upper) & 1) || + PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) + return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %u\n", + page->mp_lower, page->mp_upper, page_space(txn->mt_env)); + + } else if ((ILL & P_OVERFLOW) == 0) { + const pgno_t npages = page->mp_pages; + if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2)) + return bad_page(page, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno)) + return bad_page( + page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + page->mp_pgno + npages, txn->mt_next_pgno); + } else { + assert(false); + } + return MDBX_SUCCESS; +} + +__cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, + MDBX_page *page, + MDBX_cursor *const mc, + const txnid_t front) { + pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; + if (likely(r.err == MDBX_SUCCESS)) + r.err = page_check(mc, page); + if (unlikely(r.err != MDBX_SUCCESS)) + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return r; +} + +__hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, + MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front) { + MDBX_txn *const txn = mc->mc_txn; + tASSERT(txn, front <= txn->mt_front); + + pgr_t r; + if (unlikely(pgno >= txn->mt_next_pgno)) { + ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno); + r.page = nullptr; + r.err = MDBX_PAGE_NOTFOUND; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return r; + } + + eASSERT(txn->mt_env, + ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); + r.page = pgno2page(txn->mt_env, pgno); + if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { const MDBX_txn *spiller = txn; do { /* Spilled pages were dirtied in this txn and flushed @@ -17260,74 +17496,45 @@ mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). 
*/ if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - mdbx_search_spilled(spiller, pgno)) { - goto spilled; - } + search_spilled(spiller, pgno)) + break; - const unsigned i = mdbx_dpl_search(spiller, pgno); - assert((int)i > 0); + const unsigned i = dpl_search(spiller, pgno); + tASSERT(txn, (int)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { - ret.page = spiller->tw.dirtylist->items[i].ptr; spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; - goto dirty; + r.page = spiller->tw.dirtylist->items[i].ptr; + break; } spiller = spiller->mt_parent; - } while (spiller != NULL); + } while (spiller); } -spilled: - ret.page = pgno2page(env, pgno); - -dirty: - if (unlikely(ret.page->mp_pgno != pgno)) { - bad_page(ret.page, - "mismatch actual pgno (%" PRIaPGNO ") != expected (%" PRIaPGNO - ")\n", - ret.page->mp_pgno, pgno); - goto notfound; - } - -#if !MDBX_DISABLE_PAGECHECKS - if (unlikely(ret.page->mp_flags & P_ILL_BITS)) { - ret.err = - bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags); + if (unlikely(r.page->mp_pgno != pgno)) { + r.err = bad_page( + r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", + r.page->mp_pgno, pgno); goto bailout; } - if (unlikely(ret.page->mp_txnid > front) && - unlikely(ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { - ret.err = bad_page( - ret.page, - "invalid page txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", - ret.page->mp_txnid, - (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" - : "parent-page", - front); - goto bailout; - } + if (unlikely(mc->mc_checking & CC_PAGECHECK)) + return page_get_checker_full(ILL, r.page, mc, front); - if (unlikely((ret.page->mp_upper < ret.page->mp_lower || - ((ret.page->mp_lower | ret.page->mp_upper) & 1) || - PAGEHDRSZ + ret.page->mp_upper > env->me_psize) && - !IS_OVERFLOW(ret.page))) { - ret.err = - bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit (%u)\n", - ret.page->mp_lower, ret.page->mp_upper, page_space(env)); +#if MDBX_DISABLE_VALIDATION + r.err = MDBX_SUCCESS; +#else + r.err = page_get_checker_lite(ILL, r.page, txn, front); + if (unlikely(r.err != MDBX_SUCCESS)) goto bailout; - } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - - ret.err = MDBX_SUCCESS; - if (mdbx_audit_enabled()) - ret.err = mdbx_page_check(mc, ret.page, C_UPDATING); - return ret; +#endif /* MDBX_DISABLE_VALIDATION */ + return r; } /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -__hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot __noinline static int page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF_DEBUG; @@ -17336,13 +17543,13 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, MDBX_node *node; int i; - mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, - page_numkeys(mp)); + DEBUG("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, + page_numkeys(mp)); /* Don't assert on branch pages in the GC. We can get here * while in the process of rebalancing a GC branch page; we must * let that proceed. 
ITS#8336 */ - mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1); - mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); + cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1); + DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { i = 0; @@ -17358,51 +17565,49 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, } } } else { - const struct node_result nsr = mdbx_node_search(mc, key); - if (nsr.node) + const struct node_result nsr = node_search(mc, key); + if (likely(nsr.node)) i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; else i = page_numkeys(mp) - 1; - mdbx_debug("following index %u for key [%s]", i, DKEY_DEBUG(key)); + DEBUG("following index %u for key [%s]", i, DKEY_DEBUG(key)); } - mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); + cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); node = page_node(mp, i); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = (indx_t)i; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; ready: if (flags & MDBX_PS_MODIFY) { - if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + if (unlikely((rc = page_touch(mc)) != 0)) return rc; mp = mc->mc_pg[mc->mc_top]; } } -#if !MDBX_DISABLE_PAGECHECKS - if (unlikely(!IS_LEAF(mp))) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return bad_page(mp, "index points to a page with 0x%02x flags\n", - mp->mp_flags); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, - DKEY_DEBUG(key)); + DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, + DKEY_DEBUG(key)); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; return MDBX_SUCCESS; } -static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, - const unsigned pagesize) { +static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, + const unsigned pagesize) { if (unlikely(!dbx->md_cmp)) { dbx->md_cmp = get_default_keycmp(db->md_flags); dbx->md_dcmp = get_default_datacmp(db->md_flags); @@ -17420,10 +17625,10 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, assert(dbx->md_vlen_max != (unsigned)-1); if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(db->md_xsize < dbx->md_vlen_min || + if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || db->md_xsize > dbx->md_vlen_max)) { - mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", - db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max); + ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, + dbx->md_vlen_min, dbx->md_vlen_max); return MDBX_CORRUPTED; } dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; @@ -17431,52 +17636,49 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, return MDBX_SUCCESS; } -static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { +static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { MDBX_cursor_couple couple; - if (unlikely(TXN_DBI_CHANGED(txn, dbi))) { - mdbx_notice("dbi %u was changed for txn %" PRIaTXN, dbi, 
txn->mt_txnid); + if (unlikely(dbi_changed(txn, dbi))) { + NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); return MDBX_BAD_DBI; } - int rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + int rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; - rc = mdbx_page_search(&couple.outer, &dbx->md_name, 0); + rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { notfound: - mdbx_notice("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN - " (err %d)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, rc); + NOTICE("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN + " (err %d)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, rc); return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; } MDBX_val data; - struct node_result nsr = mdbx_node_search(&couple.outer, &dbx->md_name); + struct node_result nsr = node_search(&couple.outer, &dbx->md_name); if (unlikely(!nsr.exact)) { rc = MDBX_NOTFOUND; goto notfound; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong flags"); + NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong flags"); return MDBX_INCOMPATIBLE; /* not a named DB */ } - const txnid_t pp_txnid = - pp_txnid4chk(couple.outer.mc_pg[couple.outer.mc_top], txn); - rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid); + rc = node_read(&couple.outer, nsr.node, &data, + couple.outer.mc_pg[couple.outer.mc_top]); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (unlikely(data.iov_len != sizeof(MDBX_db))) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong rec-size"); + NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong rec-size"); return MDBX_INCOMPATIBLE; /* not a named DB */ } @@ -17485,24 +17687,24 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * have dropped and recreated the DB with other flags. 
*/ MDBX_db *const db = &txn->mt_dbs[dbi]; if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { - mdbx_notice("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN - " with different flags (present 0x%X != wanna 0x%X)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, - db->md_flags & DB_PERSISTENT_FLAGS, md_flags); + NOTICE("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN + " with different flags (present 0x%X != wanna 0x%X)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); return MDBX_INCOMPATIBLE; } memcpy(db, data.iov_base, sizeof(MDBX_db)); -#if !MDBX_DISABLE_PAGECHECKS - mdbx_tassert(txn, txn->mt_front >= pp_txnid); +#if !MDBX_DISABLE_VALIDATION + const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; + tASSERT(txn, txn->mt_front >= pp_txnid); if (unlikely(db->md_mod_txnid > pp_txnid)) { - mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", - db->md_mod_txnid, pp_txnid); + ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + db->md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize); +#endif /* !MDBX_DISABLE_VALIDATION */ + rc = setup_dbx(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17515,20 +17717,19 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known to * be underfilled. */ -__hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { +__hot static int page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_BRANCH(mp)); + cASSERT(mc, IS_BRANCH(mp)); MDBX_node *node = page_node(mp, 0); - int rc; - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; - return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST); + return page_search_root(mc, NULL, MDBX_PS_FIRST); } /* Search for the page a given key should be in. @@ -17545,32 +17746,31 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { * lookups. * * Returns 0 on success, non-zero on failure. */ -__hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { int rc; pgno_t root; /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { - mdbx_debug("%s", "transaction has failed, must abort"); + DEBUG("%s", "transaction has failed, must abort"); return MDBX_BAD_TXN; } /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbistate & DBI_STALE)) { - rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi); + rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } root = mc->mc_db->md_root; if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdbx_debug("%s", "tree is empty"); + DEBUG("%s", "tree is empty"); return MDBX_NOTFOUND; } - mdbx_cassert(mc, root >= NUM_METAS); + cASSERT(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { txnid_t pp_txnid = mc->mc_db->md_mod_txnid; pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid @@ -17582,63 +17782,81 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, if ((scan->mt_flags & MDBX_TXN_DIRTY) && (mc->mc_dbi == MAIN_DBI || (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) { + /* After committing nested transactions, mod_txnid may be > front */ pp_txnid = scan->mt_front; break; } while (unlikely((scan = scan->mt_parent) != nullptr)); } - if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) + if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) return rc; } mc->mc_snum = 1; mc->mc_top = 0; - mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, - mc->mc_pg[0]->mp_flags); + DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); if (flags & MDBX_PS_MODIFY) { - if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = mdbx_touch_dbi(mc))) + if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc))) return rc; - if (unlikely(rc = mdbx_page_touch(mc))) + if (unlikely(rc = page_touch(mc))) return rc; } if (flags & MDBX_PS_ROOTONLY) return MDBX_SUCCESS; - return mdbx_page_search_root(mc, key, flags); + return page_search_root(mc, key, flags); } -/* Return the data associated with a given node. - * - * [in] mc The cursor for this operation. - * [in] leaf The node being read. - * [out] data Updated to point to the node's data. - * - * Returns 0 on success, non-zero on failure. */ -static __always_inline int mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *node, MDBX_val *data, - const txnid_t front) { - data->iov_len = node_ds(node); - data->iov_base = node_data(node); - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { - /* Read overflow data. */ - MDBX_page *omp; /* overflow page */ - int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, front); - if (unlikely((rc != MDBX_SUCCESS))) { - mdbx_debug("read overflow page %" PRIaPGNO " failed", - node_largedata_pgno(node)); - return rc; +/* Read large/overflow node data. 
*/ +static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { + cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); + + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((lp.err != MDBX_SUCCESS))) { + DEBUG("read large/overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return lp.err; + } + + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + data->iov_base = page_data(lp.page); + if (!MDBX_DISABLE_VALIDATION) { + const MDBX_env *env = mc->mc_txn->mt_env; + const size_t dsize = data->iov_len; + if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax)) + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + return bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else + poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } - data->iov_base = page_data(omp); } return MDBX_SUCCESS; } +/* Return the data associated with a given node. */ +static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (likely(node_flags(node) != F_BIGDATA)) + return MDBX_SUCCESS; + return node_read_bigdata(mc, node, data, mp); +} + int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -17651,11 +17869,11 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; + return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; } int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, @@ -17674,7 +17892,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, return MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17684,7 +17902,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -17697,11 +17915,11 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; + rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && values_count) *values_count = 0; @@ -17713,10 
+17931,10 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (cx.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { // coverity[uninit_use : FALSE] - mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && - (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); + tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && + (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); // coverity[uninit_use : FALSE] *values_count = (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || @@ -17737,7 +17955,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * [in] dir SIBLING_LEFT or SIBLING_RIGHT. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { +static int cursor_sibling(MDBX_cursor *mc, int dir) { int rc; MDBX_node *node; MDBX_page *mp; @@ -17746,16 +17964,16 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ - mdbx_cursor_pop(mc); - mdbx_debug("parent page is page %" PRIaPGNO ", index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + cursor_pop(mc); + DEBUG("parent page is page %" PRIaPGNO ", index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) : (mc->mc_ki[mc->mc_top] == 0)) { - mdbx_debug("no more keys aside, moving to next %s sibling", - dir ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { + DEBUG("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; @@ -17764,32 +17982,31 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { } else { assert((dir - 1) == -1 || (dir - 1) == 1); mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); - mdbx_debug("just moving to %s index key %u", - (dir == SIBLING_RIGHT) ? "right" : "left", - mc->mc_ki[mc->mc_top]); + DEBUG("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); } - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) { + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); return rc; } - rc = mdbx_cursor_push(mc, mp); + rc = cursor_push(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = - (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0); + (dir == SIBLING_LEFT) ? (indx_t)page_numkeys(mp) - 1 : 0; return MDBX_SUCCESS; } /* Move the cursor to the next data item. 
*/ -static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -17798,7 +18015,7 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return mdbx_cursor_first(mc, key, data); + return cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { @@ -17809,10 +18026,9 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_db->md_flags & MDBX_DUPSORT) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { - rc = - mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); + rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) get_key_optional(node, key); @@ -17826,8 +18042,8 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; goto skip; @@ -17837,32 +18053,30 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)++ki; const int numkeys = page_numkeys(mp); if (unlikely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != - MDBX_SUCCESS)) { + rc = cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(rc != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); } skip: - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -17870,17 +18084,16 @@ skip: } node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = 
cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17889,8 +18102,8 @@ skip: } /* Move the cursor to the previous data item. */ -static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -17899,7 +18112,7 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (unlikely(rc)) return rc; mc->mc_ki[mc->mc_top]++; @@ -17909,10 +18122,9 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & MDBX_DUPSORT) && mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_PREV || op == MDBX_PREV_DUP) { - rc = - mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); + rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) { get_key_optional(node, key); @@ -17928,8 +18140,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -17937,26 +18149,24 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)--ki; if (unlikely(ki < 0)) { mc->mc_ki[mc->mc_top] = 0; - mdbx_debug("%s", "=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) + DEBUG("%s", "=====> move to prev sibling page"); + if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) return rc; mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); } - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -17965,17 +18175,16 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = 
page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17984,9 +18193,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Set the cursor on a specific data item. */ -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op) { +__hot static struct cursor_set_result +cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node = NULL; DKBUF_DEBUG; @@ -17995,7 +18203,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, ret.exact = false; if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); + cASSERT(mc, !"Invalid key-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -18005,7 +18213,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (aligned_key.iov_len) { default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -18030,7 +18238,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (mc->mc_flags & C_INITIALIZED) { MDBX_val nodekey; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; if (unlikely(!page_numkeys(mp))) { mc->mc_ki[mc->mc_top] = 0; @@ -18051,9 +18259,8 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, * was the one we wanted. 
*/ mc->mc_ki[mc->mc_top] = 0; ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp > 0) { @@ -18068,12 +18275,12 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); if (cmp == 0) { /* last node was the one we wanted */ - mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); + cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp < 0) { @@ -18090,9 +18297,9 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (cmp == 0) { /* current node was the one we wanted */ ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } } @@ -18108,7 +18315,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, break; if (i == mc->mc_top) { /* There are no other pages */ - mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; mc->mc_flags |= C_EOF; ret.err = MDBX_NOTFOUND; @@ -18121,9 +18328,8 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (op == MDBX_SET_RANGE) goto got_node; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); ret.err = MDBX_NOTFOUND; return ret; } @@ -18131,15 +18337,15 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, mc->mc_pg[0] = 0; } - ret.err = mdbx_page_search(mc, &aligned_key, 0); + ret.err = page_search(mc, &aligned_key, 0); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + cASSERT(mc, IS_LEAF(mp)); search_node:; - struct node_result nsr = mdbx_node_search(mc, &aligned_key); + struct node_result nsr = node_search(mc, &aligned_key); node = nsr.node; ret.exact = nsr.exact; if (!ret.exact) { @@ -18153,52 +18359,51 @@ search_node:; } if (node == NULL) { - mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); - ret.err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + DEBUG("%s", "===> inexact leaf not found, goto sibling"); + ret.err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(ret.err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return ret; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + cASSERT(mc, IS_LEAF(mp)); if (!IS_LEAF2(mp)) node = page_node(mp, 0); } } - mdbx_cassert(mc, - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); got_node: mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; - if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - 
mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - ret.err = MDBX_CORRUPTED; - } else { - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } - ret.err = MDBX_SUCCESS; - } + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + ret.err = MDBX_CORRUPTED; return ret; } - if (F_ISSET(node_flags(node), F_DUPDATA)) { - ret.err = mdbx_xcursor_init1(mc, node, mp); + if (IS_LEAF2(mp)) { + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } + ret.err = MDBX_SUCCESS; + return ret; + } + + if (node_flags(node) & F_DUPDATA) { + ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { - ret.err = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } else { - ret = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_GET_BOTH && !ret.exact) { @@ -18210,7 +18415,7 @@ got_node: if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, !"Invalid data-size"); + cASSERT(mc, !"Invalid data-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -18219,7 +18424,7 @@ got_node: if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { switch (aligned_data.iov_len) { default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP"); + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -18237,15 +18442,14 @@ got_node: } } MDBX_val actual_data; - ret.err = mdbx_node_read(mc, node, &actual_data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + ret.err = node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); if (cmp) { - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); if (op != MDBX_GET_BOTH_RANGE || cmp > 0) { ret.err = MDBX_NOTFOUND; return ret; @@ -18253,8 +18457,7 @@ got_node: } *data = actual_data; } else { - ret.err = mdbx_node_read(mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } @@ -18264,57 +18467,55 @@ got_node: if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) get_key_optional(node, key); - mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), + DVAL_DEBUG(data)); ret.err = MDBX_SUCCESS; return ret; } /* Move the cursor to the first item in the database. 
*/ -static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + rc = page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; mc->mc_ki[mc->mc_top] = 0; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + key->iov_base = page_leaf2key(mp, 0, key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + MDBX_node *node = page_node(mp, 0); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -18323,50 +18524,47 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } /* Move the cursor to the last item in the database. 
*/ -static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + rc = page_search(mc, NULL, MDBX_PS_LAST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mc->mc_flags |= C_INITIALIZED | C_EOF; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], - mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -18374,8 +18572,8 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; } -int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +__hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -18392,37 +18590,37 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_ENODATA; - MDBX_page *mp = mc->mc_pg[mc->mc_top]; + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } const unsigned nkeys = page_numkeys(mp); if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { - mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); if (mc->mc_flags & C_EOF) return MDBX_ENODATA; mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - 
mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); get_key_optional(node, key); if (data) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else { @@ -18432,7 +18630,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { - rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + rc = node_read(mc, node, data, mp); if (unlikely(rc)) return rc; } @@ -18453,12 +18651,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op).err; + rc = cursor_set(mc, key, data, op).err; if (mc->mc_flags & C_INITIALIZED) { - mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); } break; case MDBX_GET_MULTIPLE: @@ -18476,7 +18673,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP); + rc = cursor_next(mc, key, data, MDBX_NEXT_DUP); if (rc == MDBX_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDBX_cursor *mx; @@ -18498,11 +18695,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (rc == MDBX_SUCCESS) { MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { - rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); + rc = cursor_sibling(mx, SIBLING_LEFT); if (rc == MDBX_SUCCESS) goto fetchm; } else { @@ -18513,18 +18710,18 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_NEXT: case MDBX_NEXT_DUP: case MDBX_NEXT_NODUP: - rc = mdbx_cursor_next(mc, key, data, op); + rc = cursor_next(mc, key, data, op); break; case MDBX_PREV: case MDBX_PREV_DUP: case MDBX_PREV_NODUP: - rc = mdbx_cursor_prev(mc, key, data, op); + rc = cursor_prev(mc, key, data, op); break; case MDBX_FIRST: - rc = mdbx_cursor_first(mc, key, data); + rc = cursor_first(mc, key, data); break; case MDBX_FIRST_DUP: - mfunc = mdbx_cursor_first; + mfunc = cursor_first; move: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; @@ -18537,10 +18734,9 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(node_flags(node), F_DUPDATA)) { + if (!(node_flags(node) & F_DUPDATA)) { get_key_optional(node, key); - rc = mdbx_node_read(mc, node, 
data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); break; } } @@ -18549,18 +18745,17 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; case MDBX_LAST: - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); break; case MDBX_LAST_DUP: - mfunc = mdbx_cursor_last; + mfunc = cursor_last; goto move; case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ case MDBX_SET_LOWERBOUND: { if (unlikely(key == NULL || data == NULL)) return MDBX_EINVAL; MDBX_val save_data = *data; - struct cursor_set_result csr = - mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE); + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) { mc->mc_flags &= ~C_DEL; @@ -18571,18 +18766,18 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * returning MDBX_BAD_VALSIZE. */ } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { *data = save_data; - csr = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + csr = + cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_NOTFOUND) { - mdbx_cassert(mc, !csr.exact); - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + cASSERT(mc, !csr.exact); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } else { int cmp = mc->mc_dbx->md_dcmp(&save_data, data); csr.exact = (cmp == 0); if (cmp > 0) - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } if (rc == MDBX_SUCCESS && !csr.exact) @@ -18594,12 +18789,12 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = MDBX_SUCCESS; else if (rc == MDBX_SUCCESS) /* exactly match, going next */ - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT); + rc = cursor_next(mc, key, data, MDBX_NEXT); } break; } default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); + DEBUG("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; } @@ -18609,11 +18804,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - int err = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + int err = page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(err != MDBX_SUCCESS)) return err; } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -18636,16 +18831,21 @@ static int cursor_next_batch(MDBX_cursor *mc) { mc->mc_ki[mc->mc_top] = (indx_t)++ki; const int numkeys = page_numkeys(mp); if (likely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - int err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + int err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return err; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, 
mp->mp_flags); + return MDBX_CORRUPTED; + } } return MDBX_SUCCESS; } @@ -18677,8 +18877,8 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA; break; default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); - rc = EINVAL; + DEBUG("unhandled/unimplemented cursor operation %u", op); + rc = MDBX_EINVAL; break; } @@ -18687,15 +18887,20 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } - const MDBX_page *const page = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(page); + const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } + const unsigned nkeys = page_numkeys(mp); unsigned i = mc->mc_ki[mc->mc_top], n = 0; if (unlikely(i >= nkeys)) { - mdbx_cassert(mc, op == MDBX_GET_CURRENT); - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, op == MDBX_GET_CURRENT); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); *count = 0; if (mc->mc_flags & C_EOF) { - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); return MDBX_ENODATA; } if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) @@ -18704,15 +18909,14 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return MDBX_NOTFOUND; } - const txnid_t pp_txnid = pp_txnid4chk(page, mc->mc_txn); do { if (unlikely(n + 2 > limit)) { rc = MDBX_RESULT_TRUE; break; } - const MDBX_node *leaf = page_node(page, i); + const MDBX_node *leaf = page_node(mp, i); get_key(leaf, &pairs[n]); - rc = mdbx_node_read(mc, leaf, &pairs[n + 1], pp_txnid); + rc = node_read(mc, leaf, &pairs[n + 1], mp); if (unlikely(rc != MDBX_SUCCESS)) break; n += 2; @@ -18723,19 +18927,19 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } -static int mdbx_touch_dbi(MDBX_cursor *mc) { - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); +static int touch_dbi(MDBX_cursor *mc) { + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - mdbx_cassert(mc, (mc->mc_flags & C_RECLAIMING) == 0); + cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; - rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); + rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -18745,25 +18949,25 @@ static int mdbx_touch_dbi(MDBX_cursor *mc) { /* Touch all the pages in the cursor stack. Set mc_top. * Makes sure all the pages are writable, before attempting a write operation. * [in] mc The cursor to operate on. 
*/ -static int mdbx_cursor_touch(MDBX_cursor *mc) { +static int cursor_touch(MDBX_cursor *mc) { int rc = MDBX_SUCCESS; if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - rc = mdbx_touch_dbi(mc); + rc = touch_dbi(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (likely(mc->mc_snum)) { mc->mc_top = 0; do { - rc = mdbx_page_touch(mc); + rc = page_touch(mc); } while (!rc && ++(mc->mc_top) < mc->mc_snum); mc->mc_top = mc->mc_snum - 1; } return rc; } -int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - unsigned flags) { +__hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + unsigned flags) { MDBX_env *env; MDBX_page *sub_root = NULL; MDBX_val xdata, *rdata, dkey, olddata; @@ -18782,10 +18986,10 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; - mdbx_cassert(mc, cursor_is_tracked(mc)); + cASSERT(mc, cursor_is_tracked(mc)); env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. */ @@ -18793,7 +18997,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(flags & MDBX_MULTIPLE)) { if (unlikely(flags & MDBX_RESERVE)) return MDBX_EINVAL; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; dcount = data[1].iov_len; if (unlikely(dcount < 2 || data->iov_len == 0)) @@ -18828,19 +19032,19 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (likely((mc->mc_flags & C_SUB) == 0)) { if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); + cASSERT(mc, !"Invalid key-size"); return MDBX_BAD_VALSIZE; } if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, !"Invalid data-size"); + cASSERT(mc, !"Invalid data-size"); return MDBX_BAD_VALSIZE; } if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (key->iov_len) { default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); return MDBX_BAD_VALSIZE; case 4: if (unlikely(3 & (uintptr_t)key->iov_base)) { @@ -18863,7 +19067,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { switch (data->iov_len) { default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); return MDBX_BAD_VALSIZE; case 4: if (unlikely(3 & (uintptr_t)data->iov_base)) { @@ -18889,10 +19093,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } } - mdbx_debug( - "==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY_DEBUG(key), key->iov_len, - DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); + DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, + DDBI(mc), DKEY_DEBUG(key), key->iov_len, + DVAL_DEBUG((flags & MDBX_RESERVE) ? 
nullptr : data), data->iov_len); int dupdata_flag = 0; if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { @@ -18913,12 +19116,11 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely((flags & MDBX_MULTIPLE))) goto drop_current; - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, - mc->mc_xcursor != NULL && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); + if (node_flags(node) & F_DUPDATA) { + cASSERT(mc, mc->mc_xcursor != NULL && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); /* Если за ключом более одного значения, либо если размер данных * отличается, то вместо обновления требуется удаление и * последующая вставка. */ @@ -18954,7 +19156,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else if ((flags & MDBX_CURRENT) == 0) { bool exact = false; if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = mdbx_cursor_last(mc, &dkey, &olddata); + rc = cursor_last(mc, &dkey, &olddata); if (likely(rc == MDBX_SUCCESS)) { rc = mc->mc_dbx->md_cmp(key, &dkey); if (likely(rc > 0)) { @@ -18971,25 +19173,25 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { struct cursor_set_result csr = /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - mdbx_cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); + cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); rc = csr.err; exact = csr.exact; } if (likely(rc == MDBX_SUCCESS)) { if (exact) { if (unlikely(flags & MDBX_NOOVERWRITE)) { - mdbx_debug("duplicate key [%s]", DKEY_DEBUG(key)); + DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); *data = olddata; return MDBX_KEYEXIST; } if (unlikely(mc->mc_flags & C_SUB)) { /* nested subtree of DUPSORT-database with the same key, * nothing to update */ - mdbx_assert(env, data->iov_len == 0 && - (olddata.iov_len == 0 || - /* olddata may not be updated in case LEAF2-page - of dupfixed-subDB */ - (mc->mc_db->md_flags & MDBX_DUPFIXED))); + eASSERT(env, data->iov_len == 0 && + (olddata.iov_len == 0 || + /* olddata may not be updated in case LEAF2-page + of dupfixed-subDB */ + (mc->mc_db->md_flags & MDBX_DUPFIXED))); return MDBX_SUCCESS; } if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && @@ -19033,22 +19235,22 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, rdata = &xdata; xdata.iov_len = data->iov_len * dcount; } - if (unlikely(err = mdbx_cursor_spill(mc, key, rdata))) + if (unlikely(err = cursor_spill(mc, key, rdata))) return err; } if (unlikely(rc == MDBX_NO_ROOT)) { /* new database, write a root leaf page */ - mdbx_debug("%s", "allocating new root leaf page"); + DEBUG("%s", "allocating new root leaf page"); if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - err = mdbx_touch_dbi(mc); + err = touch_dbi(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } - struct page_result npr = mdbx_page_new(mc, P_LEAF, 1); + pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; - npr.err = mdbx_cursor_push(mc, npr.page); + npr.err = cursor_push(mc, npr.page); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; mc->mc_db->md_root = npr.page->mp_pgno; @@ -19073,7 +19275,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_flags |= C_INITIALIZED; } else { /* make sure all cursor pages are writable 
*/ - err = mdbx_cursor_touch(mc); + err = cursor_touch(mc); if (unlikely(err)) return err; } @@ -19085,7 +19287,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_txnid = mc->mc_txn->mt_front; if (insert_key) { /* The key does not exist */ - mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); + DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && node_size(key, data) > env->me_leaf_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty @@ -19118,15 +19320,15 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } err = MDBX_SUCCESS; if (mc->mc_ki[mc->mc_top]) - err = mdbx_update_key(mc, key); - mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); - mc->mc_top += (uint16_t)dtop; + err = update_key(mc, key); + cASSERT(mc, mc->mc_top + dtop < UINT16_MAX); + mc->mc_top += (uint8_t)dtop; if (unlikely(err != MDBX_SUCCESS)) return err; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -19134,95 +19336,92 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } more:; - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* Large/Overflow page overwrites need special handling */ - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { + if (unlikely(node_flags(node) & F_BIGDATA)) { int dpages = (node_size(key, data) > env->me_leaf_nodemax) ? number_of_ovpages(env, data->iov_len) : 0; const pgno_t pgno = node_largedata_pgno(node); - struct page_result pgr = mdbx_page_get_ex( - mc, pgno, pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); - if (unlikely(pgr.err != MDBX_SUCCESS)) - return pgr.err; - if (unlikely(!IS_OVERFLOW(pgr.page))) - return MDBX_CORRUPTED; + pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? */ - int ovpages = pgr.page->mp_pages; - if (!IS_FROZEN(mc->mc_txn, pgr.page) && + int ovpages = lp.page->mp_pages; + if (!IS_FROZEN(mc->mc_txn, lp.page) && (unlikely(mc->mc_flags & C_GCFREEZE) ? (ovpages >= dpages) : (ovpages == /* LY: add configurable threshold to keep reserve space */ dpages))) { /* yes, overwrite it. 
*/ - if (!IS_MODIFIABLE(mc->mc_txn, pgr.page)) { - if (IS_SPILLED(mc->mc_txn, pgr.page)) { - pgr = /* TODO: avoid search and get txn & spill-index from + if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { + if (IS_SPILLED(mc->mc_txn, lp.page)) { + lp = /* TODO: avoid search and get txn & spill-index from page_result */ - mdbx_page_unspill(mc->mc_txn, pgr.page); - if (unlikely(pgr.err)) - return pgr.err; + page_unspill(mc->mc_txn, lp.page); + if (unlikely(lp.err)) + return lp.err; } else { if (unlikely(!mc->mc_txn->mt_parent)) { - mdbx_error( - "Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - "overflow/large", pgno, pgr.page->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + "overflow/large", pgno, lp.page->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); return MDBX_PROBLEM; } /* It is writable only in a parent txn */ - MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); + MDBX_page *np = page_malloc(mc->mc_txn, ovpages); if (unlikely(!np)) return MDBX_ENOMEM; - memcpy(np, pgr.page, PAGEHDRSZ); /* Copy header of page */ - err = mdbx_page_dirty(mc->mc_txn, pgr.page = np, ovpages); + memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ + err = page_dirty(mc->mc_txn, lp.page = np, ovpages); if (unlikely(err != MDBX_SUCCESS)) return err; #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn)); + cASSERT(mc, dirtylist_check(mc->mc_txn)); } } node_set_ds(node, data->iov_len); - if (F_ISSET(flags, MDBX_RESERVE)) - data->iov_base = page_data(pgr.page); + if (flags & MDBX_RESERVE) + data->iov_base = page_data(lp.page); else - memcpy(page_data(pgr.page), data->iov_base, data->iov_len); + memcpy(page_data(lp.page), data->iov_base, data->iov_len); - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } return MDBX_SUCCESS; } - if ((err = mdbx_page_retire(mc, pgr.page)) != MDBX_SUCCESS) + if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) return err; } else { olddata.iov_len = node_ds(node); olddata.iov_base = node_data(node); - mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <= - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <= + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. * mp: new (sub-)page. offset: growth in page size. @@ -19233,7 +19432,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ - if (!F_ISSET(node_flags(node), F_DUPDATA)) { + if (!(node_flags(node) & F_DUPDATA)) { /* does data match? */ const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); @@ -19254,13 +19453,13 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, * considers them equal. So continue update since called without. 
* Continue to update since was called without MDBX_NODUPDATA. */ } - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); goto current; } /* Just overwrite the current item */ if (flags & MDBX_CURRENT) { - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); goto current; } @@ -19277,11 +19476,11 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } else { xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.iov_len & 1) + (data->iov_len & 1); - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ @@ -19335,12 +19534,12 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, nested_dupdb.md_entries = page_numkeys(fp); xdata.iov_len = sizeof(nested_dupdb); xdata.iov_base = &nested_dupdb; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const pgr_t par = page_alloc(mc); mp = par.page; if (unlikely(par.err != MDBX_SUCCESS)) return par.err; mc->mc_db->md_leaf_pages += 1; - mdbx_cassert(mc, env->me_psize > olddata.iov_len); + cASSERT(mc, env->me_psize > olddata.iov_len); offset = env->me_psize - (unsigned)olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; nested_dupdb.md_root = mp->mp_pgno; @@ -19353,7 +19552,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mp->mp_txnid = mc->mc_txn->mt_front; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX); mp->mp_upper = (indx_t)(fp->mp_upper + offset); if (unlikely(fp_flags & P_LEAF2)) { memcpy(page_data(mp), page_data(fp), @@ -19365,7 +19564,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); for (i = 0; i < page_numkeys(fp); i++) { - mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); + cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); mp->mp_ptrs[i] += (indx_t)offset; } } @@ -19375,7 +19574,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, flags |= F_DUPDATA; do_sub = true; if (!insert_key) - mdbx_node_del(mc, 0); + node_del(mc, 0); goto new_sub; } @@ -19385,58 +19584,57 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, current: if (data->iov_len == olddata.iov_len) { - mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); + cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. 
*/ - if (F_ISSET(flags, MDBX_RESERVE)) + if (flags & MDBX_RESERVE) data->iov_base = olddata.iov_base; else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { - mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); - mdbx_cassert(mc, node_ds(node) == 0); - mdbx_cassert(mc, node_flags(node) == 0); - mdbx_cassert(mc, key->iov_len < UINT16_MAX); + cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); + cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); + cASSERT(mc, node_ds(node) == 0); + cASSERT(mc, node_flags(node) == 0); + cASSERT(mc, key->iov_len < UINT16_MAX); node_set_ks(node, key->iov_len); memcpy(node_key(node), key->iov_base, key->iov_len); - mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, (char *)node_key(node) + node_ds(node) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } return MDBX_SUCCESS; } } - mdbx_node_del(mc, 0); + node_del(mc, 0); } rdata = data; new_sub:; - unsigned nflags = flags & NODE_ADD_FLAGS; + const unsigned naf = flags & NODE_ADD_FLAGS; size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len : leaf_size(env, key, rdata); if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - if (!insert_key) - nflags |= MDBX_SPLIT_REPLACE; - rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + rc = page_split(mc, key, rdata, P_INVALID, + insert_key ? naf : naf | MDBX_SPLIT_REPLACE); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); } else { /* There is room already in this leaf page. 
*/ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0 && - rdata->iov_len == 0); - rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); + cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && + rdata->iov_len == 0); + rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); + rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; @@ -19477,7 +19675,7 @@ new_sub:; SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; - err = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -19506,7 +19704,7 @@ new_sub:; continue; if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { - err = mdbx_xcursor_init2(m2, mx, dupdata_flag); + err = cursor_xinit2(m2, mx, dupdata_flag); if (unlikely(err != MDBX_SUCCESS)) return err; } else if (!insert_key && m2->mc_ki[i] < nkeys) { @@ -19515,7 +19713,7 @@ new_sub:; } } } - mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); + cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == @@ -19553,13 +19751,13 @@ new_sub:; } } } - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = cursor_check(mc); return rc; bad_sub: if (unlikely(rc == MDBX_KEYEXIST)) { /* should not happen, we deleted that item */ - mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc); + ERROR("Unexpected %i error while put to nested dupsort's hive", rc); rc = MDBX_PROBLEM; } } @@ -19567,7 +19765,7 @@ new_sub:; return rc; } -int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { +__hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -19579,7 +19777,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -19589,33 +19787,30 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_NOTFOUND; if (likely((flags & MDBX_NOSPILL) == 0) && - unlikely(rc = mdbx_cursor_spill(mc, NULL, NULL))) + unlikely(rc = cursor_spill(mc, NULL, NULL))) return rc; - rc = mdbx_cursor_touch(mc); + rc = cursor_touch(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; - if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } - goto del_key; } + if (IS_LEAF2(mp)) + goto del_key; MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { 
+ if (node_flags(node) & F_DUPDATA) { if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* mdbx_cursor_del0() will subtract the final entry */ + /* cursor_del() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) + if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) @@ -19630,7 +19825,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } else { MDBX_cursor *m2; /* shrink fake page */ - mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); + node_shrink(mp, mc->mc_ki[mc->mc_top]); node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ @@ -19653,8 +19848,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } } mc->mc_db->md_entries--; - mdbx_cassert(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && - mc->mc_db->md_root != P_INVALID); + cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && + mc->mc_db->md_root != P_INVALID); return rc; } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; @@ -19664,7 +19859,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (node_flags(node) & F_SUBDATA) { /* add all the child DB's pages to the free list */ - rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc)) goto fail; } @@ -19673,17 +19868,15 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) return MDBX_INCOMPATIBLE; - /* add overflow pages to free list */ - if (F_ISSET(node_flags(node), F_BIGDATA)) { - MDBX_page *omp; - if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, - pp_txnid4chk(mp, mc->mc_txn))) || - (rc = mdbx_page_retire(mc, omp)))) + /* add large/overflow pages to free list */ + if (node_flags(node) & F_BIGDATA) { + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) goto fail; } del_key: - return mdbx_cursor_del0(mc); + return cursor_del(mc); fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -19691,66 +19884,74 @@ fail: } /* Allocate and initialize new pages for a database. - * Set MDBX_TXN_ERROR on failure. - * - * [in] mc a cursor on the database being added to. - * [in] flags flags defining what type of page is being allocated. - * [in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. - * [out] mp Address of a page, or NULL on failure. - * - * Returns 0 on success, non-zero on failure. */ -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages) { - struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL); + * Set MDBX_TXN_ERROR on failure. 
*/ +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { + cASSERT(mc, (flags & P_OVERFLOW) == 0); + pgr_t ret = page_alloc(mc); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi, - ret.page->mp_pgno, npages); + DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; ret.page->mp_txnid = mc->mc_txn->mt_front; - mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); - mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + STATIC_ASSERT(P_BRANCH == 1); + const unsigned is_branch = flags & P_BRANCH; + + ret.page->mp_lower = 0; + ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); + mc->mc_db->md_branch_pages += is_branch; + mc->mc_db->md_leaf_pages += 1 - is_branch; + if (unlikely(mc->mc_flags & C_SUB)) { + MDBX_db *outer = outer_db(mc); + outer->md_branch_pages += is_branch; + outer->md_leaf_pages += 1 - is_branch; + } + return ret; +} + +static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { + pgr_t ret = likely(npages == 1) + ? page_alloc(mc) + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + + DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, + ret.page->mp_pgno, npages); + ret.page->mp_flags = P_OVERFLOW; + ret.page->mp_txnid = mc->mc_txn->mt_front; + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (likely((flags & P_OVERFLOW) == 0)) { - STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = flags & P_BRANCH; - ret.page->mp_lower = 0; - ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); - mc->mc_db->md_branch_pages += is_branch; - mc->mc_db->md_leaf_pages += 1 - is_branch; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - outer->md_branch_pages += is_branch; - outer->md_leaf_pages += 1 - is_branch; - } - } else { - mc->mc_db->md_overflow_pages += npages; - ret.page->mp_pages = npages; - mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); - } - + mc->mc_db->md_overflow_pages += npages; + ret.page->mp_pages = npages; + cASSERT(mc, !(mc->mc_flags & C_SUB)); return ret; } -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key) { +__hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, " - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to leaf2-%spage %" PRIaPGNO " index %i, " + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? 
key->iov_len : 0, + DKEY_DEBUG(key)); - mdbx_cassert(mc, key); - mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2)); + cASSERT(mc, key); + cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); const unsigned ksize = mc->mc_db->md_xsize; - mdbx_cassert(mc, ksize == key->iov_len); + cASSERT(mc, ksize == key->iov_len); const unsigned nkeys = page_numkeys(mp); /* Just using these for counting */ @@ -19764,7 +19965,7 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, mp->mp_upper = (indx_t)upper; char *const ptr = page_leaf2key(mp, indx, ksize); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); const unsigned diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ @@ -19774,23 +19975,22 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno) { +static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + pgno_t pgno) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, + key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH); + cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); STATIC_ASSERT(NODESIZE % 2 == 0); /* Move higher pointers up one slot. */ const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; @@ -19818,60 +20018,60 @@ static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags) { +__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, - data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, key != NULL && data != NULL); - mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF); - mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); + DEBUG("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, + key ? key->iov_len : 0, DKEY_DEBUG(key)); + cASSERT(mc, key != NULL && data != NULL); + cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); MDBX_page *largepage = NULL; size_t node_bytes; if (unlikely(flags & F_BIGDATA)) { - /* Data already on overflow page. */ + /* Data already on large/overflow page. 
*/ STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { - /* Put data on overflow page. */ + /* Put data on large/overflow page. */ if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", - "dupsort-db", mc->mc_db->md_flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", + mc->mc_db->md_flags); return MDBX_PROBLEM; } if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node", - flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "node", + flags); return MDBX_PROBLEM; } const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); - const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages); + const pgr_t npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; - mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR - " data bytes", - largepage->mp_pages, largepage->mp_pgno, data->iov_len); + DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->mp_pages, largepage->mp_pgno, data->iov_len); flags |= F_BIGDATA; node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); } else { node_bytes = node_size(key, data) + sizeof(indx_t); } - mdbx_cassert(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. */ const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; @@ -19895,22 +20095,19 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, void *nodedata = node_data(node); if (likely(largepage == NULL)) { - if (unlikely(flags & F_BIGDATA)) + if (unlikely(flags & F_BIGDATA)) { memcpy(nodedata, data->iov_base, sizeof(pgno_t)); - else if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); + return MDBX_SUCCESS; + } } else { poke_pgno(nodedata, largepage->mp_pgno); nodedata = page_data(largepage); - if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); } + if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(nodedata != data->iov_base && + data->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(nodedata, data->iov_base, data->iov_len); return MDBX_SUCCESS; } @@ -19918,75 +20115,65 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, * [in] mc Cursor pointing to the node to delete. * [in] ksize The size of a node. Only used if the page is * part of a MDBX_DUPFIXED database. 
*/ -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { +__hot static void node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - int indx = mc->mc_ki[mc->mc_top]; - int i, j, nkeys, ptr; - MDBX_node *node; - char *base; + const unsigned hole = mc->mc_ki[mc->mc_top]; + const unsigned nkeys = page_numkeys(mp); - mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx, - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, indx < nkeys); + DEBUG("delete node %u on %s page %" PRIaPGNO, hole, + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); + cASSERT(mc, hole < nkeys); if (IS_LEAF2(mp)) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); - unsigned diff = nkeys - 1 - indx; - base = page_leaf2key(mp, indx, ksize); + cASSERT(mc, ksize >= sizeof(indx_t)); + unsigned diff = nkeys - 1 - hole; + char *base = page_leaf2key(mp, hole, ksize); if (diff) memmove(base, base + ksize, diff * ksize); - mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, - (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); return; } - node = page_node(mp, indx); - mdbx_cassert(mc, !IS_BRANCH(mp) || indx || node_ks(node) == 0); - size_t sz = NODESIZE + node_ks(node); - if (IS_LEAF(mp)) { - if (F_ISSET(node_flags(node), F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += node_ds(node); - } - sz = EVEN(sz); + MDBX_node *node = page_node(mp, hole); + cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); + size_t hole_size = NODESIZE + node_ks(node); + if (IS_LEAF(mp)) + hole_size += + (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) : node_ds(node); + hole_size = EVEN(hole_size); - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < nkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) { - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz); - mp->mp_ptrs[j] += (indx_t)sz; - } - j++; - } - } + const indx_t hole_offset = mp->mp_ptrs[hole]; + unsigned r, w; + for (r = w = 0; r < nkeys; r++) + if (r != hole) + mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) + ? mp->mp_ptrs[r] + (indx_t)hole_size + : mp->mp_ptrs[r]; - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + sz, base, ptr - mp->mp_upper); + char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + memmove(base + hole_size, base, hole_offset - mp->mp_upper); - mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); - mp->mp_upper += (indx_t)sz; + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); + mp->mp_upper += (indx_t)hole_size; -#if MDBX_DEBUG > 0 - if (mdbx_audit_enabled()) { - int page_check_err = mdbx_page_check(mc, mp, C_UPDATING); - mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); + if (AUDIT_ENABLED()) { + const uint8_t checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int page_check_err = page_check(mc, mp); + mc->mc_checking = checking; + cASSERT(mc, page_check_err == MDBX_SUCCESS); } -#endif } /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. 
*/ -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { +static void node_shrink(MDBX_page *mp, unsigned indx) { MDBX_node *node; MDBX_page *sp, *xp; char *base; @@ -20044,11 +20231,11 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { * depend only on the parent DB. * * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */ -static int mdbx_xcursor_init0(MDBX_cursor *mc) { +static int cursor_xinit0(MDBX_cursor *mc) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } @@ -20061,7 +20248,11 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { mx->mx_cursor.mc_dbistate = mc->mc_dbistate; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; + STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2); + cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); + mx->mx_cursor.mc_checking = + mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1); mx->mx_dbx.md_name.iov_len = 0; mx->mx_dbx.md_name.iov_base = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; @@ -20076,43 +20267,42 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDBX_db record for the sorted-dup database. */ -static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, - const MDBX_page *mp) { +static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } const uint8_t flags = node_flags(node); switch (flags) { default: - mdbx_error("invalid node flags %u", flags); + ERROR("invalid node flags %u", flags); return MDBX_CORRUPTED; case F_DUPDATA | F_SUBDATA: - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { - mdbx_error("invalid nested-db record size %zu", node_ds(node)); + ERROR("invalid nested-db record size %zu", node_ds(node)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); const txnid_t pp_txnid = mp->mp_txnid; - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { - mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN - ")", - mx->mx_db.md_mod_txnid, pp_txnid); + ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + mx->mx_db.md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; break; case F_DUPDATA: - if (!MDBX_DISABLE_PAGECHECKS && unlikely(node_ds(node) <= PAGEHDRSZ)) { - mdbx_error("invalid nested-page size %zu", node_ds(node)); + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) 
{ + ERROR("invalid nested-page size %zu", node_ds(node)); return MDBX_CORRUPTED; } MDBX_page *fp = node_data(node); @@ -20125,8 +20315,7 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_db.md_mod_txnid = mp->mp_txnid; mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = - C_INITIALIZED | C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); @@ -20136,23 +20325,22 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, } if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mc->mc_db->md_xsize != 0)) { - mdbx_error("cursor mismatched nested-db md_xsize %u", - mc->mc_db->md_xsize); + if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { + ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { - mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags); + ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { - mdbx_error("mismatched nested-db.md_xsize (%u) <> min/max value-length " - "(%zu/%zu)", - mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); + ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length " + "(%zu/%zu)", + mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); return MDBX_CORRUPTED; } mc->mc_db->md_xsize = mx->mx_db.md_xsize; @@ -20161,8 +20349,8 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; - mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); return MDBX_SUCCESS; } @@ -20173,19 +20361,19 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. * [in] src_mx The xcursor of an up-to-date cursor. * [in] new_dupdata True if converting from a non-F_DUPDATA item. 
*/ -static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, - bool new_dupdata) { +static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, + bool new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } if (new_dupdata) { mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; } @@ -20195,16 +20383,15 @@ static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; if (mx->mx_cursor.mc_flags & C_INITIALIZED) { - mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); } return MDBX_SUCCESS; } -static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, - const MDBX_dbi dbi, MDBX_txn *const txn, - MDBX_db *const db, MDBX_dbx *const dbx, - uint8_t *const dbstate) { +static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi, + MDBX_txn *const txn, MDBX_db *const db, + MDBX_dbx *const dbx, uint8_t *const dbstate) { couple->outer.mc_signature = MDBX_MC_LIVE; couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; @@ -20217,22 +20404,28 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, couple->outer.mc_top = 0; couple->outer.mc_pg[0] = 0; couple->outer.mc_flags = 0; + STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && + CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); + couple->outer.mc_checking = + (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION)) + ? CC_PAGECHECK | CC_LEAF + : CC_LEAF; couple->outer.mc_ki[0] = 0; couple->outer.mc_xcursor = NULL; int rc = MDBX_SUCCESS; if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { - rc = mdbx_page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); + rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS; } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { - rc = mdbx_setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, - txn->mt_env->me_psize); + rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, + txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; couple->outer.mc_xcursor = &couple->inner; - rc = mdbx_xcursor_init0(&couple->outer); + rc = cursor_xinit0(&couple->outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; @@ -20242,15 +20435,15 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, } /* Initialize a cursor for a given transaction and database. 
*/ -static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { +static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], - &txn->mt_dbistate[dbi]); + return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], + &txn->mt_dbistate[dbi]); } MDBX_cursor *mdbx_cursor_create(void *context) { - MDBX_cursor_couple *couple = mdbx_calloc(1, sizeof(MDBX_cursor_couple)); + MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple)); if (unlikely(!couple)) return nullptr; @@ -20300,11 +20493,11 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) return MDBX_BAD_DBI; - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EACCESS; if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { - mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); if (unlikely(mc->mc_dbi != dbi || /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || mc->mc_txn != txn)) @@ -20324,16 +20517,16 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (mc->mc_signature == MDBX_MC_LIVE) { if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - mdbx_error("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? mc->mc_txn->mt_signature : 0); + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? mc->mc_txn->mt_signature : 0); return MDBX_PROBLEM; } if (mc->mc_flags & C_UNTRACK) { MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_cassert(mc, *prev == mc); + cASSERT(mc, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = MDBX_MC_READY4CLOSE; @@ -20344,9 +20537,9 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { mc->mc_dbx = NULL; mc->mc_dbistate = NULL; } - mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); - rc = mdbx_cursor_init(mc, txn, dbi); + rc = cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20418,27 +20611,27 @@ again: void mdbx_cursor_close(MDBX_cursor *mc) { if (likely(mc)) { - mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE || - mc->mc_signature == MDBX_MC_READY4CLOSE); + ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE || + mc->mc_signature == MDBX_MC_READY4CLOSE); MDBX_txn *const txn = mc->mc_txn; if (!mc->mc_backup) { mc->mc_txn = NULL; /* Unlink from txn, if tracked. 
*/ if (mc->mc_flags & C_UNTRACK) { - mdbx_ensure(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); + ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_tassert(txn, *prev == mc); + tASSERT(txn, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = 0; mc->mc_next = mc; - mdbx_free(mc); + osal_free(mc); } else { /* Cursor closed before nested txn ends */ - mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE); - mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE); + ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); mc->mc_signature = MDBX_MC_WAIT4EOT; } } @@ -20491,9 +20684,9 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { *countp = 1; if (mc->mc_xcursor != NULL) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); + if (node_flags(node) & F_DUPDATA) { + cASSERT(mc, mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) ? PTRDIFF_MAX : (size_t)mc->mc_xcursor->mx_db.md_entries; @@ -20507,7 +20700,7 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { * [in] mc Cursor pointing to the node to operate on. * [in] key The new key to use. * Returns 0 on success, non-zero on failure. */ -static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { +static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp; MDBX_node *node; char *base; @@ -20516,7 +20709,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { int ptr, i, nkeys, indx; DKBUF_DEBUG; - mdbx_cassert(mc, cursor_is_tracked(mc)); + cASSERT(mc, cursor_is_tracked(mc)); indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node = page_node(mp, indx); @@ -20525,8 +20718,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_val k2; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); + DEBUG("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, ptr, + DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); #endif /* MDBX_DEBUG */ /* Sizes must be 2-byte aligned. 
*/ @@ -20538,19 +20731,19 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { if (delta) { if (delta > (int)page_room(mp)) { /* not enough space left, do a delete and split */ - mdbx_debug("Not enough room, delta = %zd, splitting...", delta); + DEBUG("Not enough room, delta = %zd, splitting...", delta); pgno_t pgno = node_pgno(node); - mdbx_node_del(mc, 0); - int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); - return rc; + node_del(mc, 0); + int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); + if (err == MDBX_SUCCESS && AUDIT_ENABLED()) + err = cursor_check_updating(mc); + return err; } nkeys = page_numkeys(mp); for (i = 0; i < nkeys; i++) { if (mp->mp_ptrs[i] <= ptr) { - mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); + cASSERT(mc, mp->mp_ptrs[i] >= delta); mp->mp_ptrs[i] -= (indx_t)delta; } } @@ -20558,7 +20751,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { base = (char *)mp + mp->mp_upper + PAGEHDRSZ; len = ptr - mp->mp_upper + NODESIZE; memmove(base - delta, base, len); - mdbx_cassert(mc, mp->mp_upper >= delta); + cASSERT(mc, mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; node = page_node(mp, indx); @@ -20573,41 +20766,41 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { } /* Move a node from csrc to cdst. */ -static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { +static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { int rc; DKBUF_DEBUG; MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); - mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); - if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) { + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); + if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { bailout: - mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node", - PAGETYPE(psrc), PAGETYPE(pdst)); + ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", + PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; } MDBX_val key4move; - switch (PAGETYPE(psrc)) { + switch (PAGETYPE_WHOLE(psrc)) { case P_BRANCH: { const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); - mdbx_cassert(csrc, node_flags(srcnode) == 0); + cASSERT(csrc, node_flags(srcnode) == 0); const pgno_t srcpg = node_pgno(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); if (csrc->mc_ki[csrc->mc_top] == 0) { const unsigned snum = csrc->mc_snum; - mdbx_cassert(csrc, snum > 0); + cASSERT(csrc, snum > 0); /* must find the lowest key below src */ - rc = mdbx_page_search_lowest(csrc); + rc = page_search_lowest(csrc); MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; if (unlikely(rc)) return rc; - mdbx_cassert(csrc, IS_LEAF(lowest_page)); + cASSERT(csrc, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; if (IS_LEAF2(lowest_page)) { @@ -20620,28 +20813,28 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - csrc->mc_snum = snum; - csrc->mc_top = snum - 1; + csrc->mc_snum = (uint8_t)snum; + 
csrc->mc_top = (uint8_t)snum - 1; csrc->mc_ki[csrc->mc_top] = 0; /* paranoia */ - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(csrc, IS_BRANCH(psrc)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(csrc, IS_BRANCH(psrc)); if (unlikely(!IS_BRANCH(psrc))) goto bailout; } if (cdst->mc_ki[cdst->mc_top] == 0) { const unsigned snum = cdst->mc_snum; - mdbx_cassert(csrc, snum > 0); + cASSERT(csrc, snum > 0); MDBX_cursor mn; cursor_copy(cdst, &mn); /* must find the lowest key below dst */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; - mdbx_cassert(cdst, IS_LEAF(lowest_page)); + cASSERT(cdst, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; MDBX_val key; @@ -20655,8 +20848,8 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - mn.mc_snum = snum; - mn.mc_top = snum - 1; + mn.mc_snum = (uint8_t)snum; + mn.mc_top = (uint8_t)snum - 1; mn.mc_ki[mn.mc_top] = 0; const intptr_t delta = @@ -20667,13 +20860,12 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc)) return rc; } else { @@ -20682,25 +20874,23 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; } - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = - mdbx_node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); + rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); } break; case P_LEAF: { /* Mark src and dst as dirty. 
*/ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; @@ -20710,33 +20900,34 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { data.iov_base = node_data(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, - node_flags(srcnode)); + rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, + node_flags(srcnode)); } break; case P_LEAF | P_LEAF2: { /* Mark src and dst as dirty. */ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; key4move.iov_len = csrc->mc_db->md_xsize; key4move.iov_base = page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = mdbx_node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); + rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); } break; default: + assert(false); goto bailout; } @@ -20744,17 +20935,17 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { return rc; /* Delete the node from the source page. 
*/ - mdbx_node_del(csrc, key4move.iov_len); + node_del(csrc, key4move.iov_len); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; const MDBX_dbi dbi = csrc->mc_dbi; - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); if (fromleft) { /* If we're adding on the left, bump others up */ for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -20769,7 +20960,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]++; } if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) @@ -20787,7 +20978,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (!m3->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]--; } else { m3->mc_ki[csrc->mc_top]--; @@ -20802,7 +20993,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { /* Update the parent separators. */ if (csrc->mc_ki[csrc->mc_top] == 0) { - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); if (csrc->mc_ki[csrc->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(psrc)) { @@ -20813,15 +21004,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", - psrc->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for source page %" PRIaPGNO " to [%s]", + psrc->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor mn; cursor_copy(csrc, &mn); - mdbx_cassert(csrc, mn.mc_snum > 0); + cASSERT(csrc, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -20829,14 +21020,14 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = csrc->mc_ki[csrc->mc_top]; csrc->mc_ki[csrc->mc_top] = 0; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; - mdbx_cassert(csrc, rc == MDBX_SUCCESS); + cASSERT(csrc, rc == MDBX_SUCCESS); } } if (cdst->mc_ki[cdst->mc_top] == 0) { - mdbx_cassert(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_top > 0); if (cdst->mc_ki[cdst->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(pdst)) { @@ -20847,15 +21038,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", - pdst->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for 
destination page %" PRIaPGNO " to [%s]", + pdst->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor mn; cursor_copy(cdst, &mn); - mdbx_cassert(cdst, mn.mc_snum > 0); + cASSERT(cdst, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -20863,9 +21054,9 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = cdst->mc_ki[cdst->mc_top]; cdst->mc_ki[cdst->mc_top] = 0; - rc = mdbx_update_key(cdst, &nullkey); + rc = update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - mdbx_cassert(cdst, rc == MDBX_SUCCESS); + cASSERT(cdst, rc == MDBX_SUCCESS); } } @@ -20881,46 +21072,45 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { * [in] cdst Cursor pointing to the destination page. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { +static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val key; int rc; - mdbx_cassert(csrc, csrc != cdst); - mdbx_cassert(csrc, cursor_is_tracked(csrc)); - mdbx_cassert(cdst, cursor_is_tracked(cdst)); + cASSERT(csrc, csrc != cdst); + cASSERT(csrc, cursor_is_tracked(csrc)); + cASSERT(cdst, cursor_is_tracked(cdst)); const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, - pdst->mp_pgno); + DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, + pdst->mp_pgno); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); - mdbx_cassert(csrc, - csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); - mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdbx_cassert(cdst, cdst->mc_snum > 1); - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || - IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); - const int pagetype = PAGETYPE(psrc); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); + cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */ + cASSERT(cdst, cdst->mc_snum > 1); + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || + IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); + const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ const unsigned dst_nkeys = page_numkeys(pdst); const unsigned src_nkeys = page_numkeys(psrc); - mdbx_cassert(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); + cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); if (likely(src_nkeys)) { unsigned j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. 
*/ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; key.iov_len = csrc->mc_db->md_xsize; key.iov_base = page_data(psrc); unsigned i = 0; do { - rc = mdbx_node_add_leaf2(cdst, j++, &key); + rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_base = (char *)key.iov_base + key.iov_len; @@ -20933,23 +21123,23 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_cursor mn; cursor_copy(csrc, &mn); /* must find the lowest key below src */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; const MDBX_page *mp = mn.mc_pg[mn.mc_top]; if (likely(!IS_LEAF2(mp))) { - mdbx_cassert(&mn, IS_LEAF(mp)); + cASSERT(&mn, IS_LEAF(mp)); const MDBX_node *lowest = page_node(mp, 0); key.iov_len = node_ks(lowest); key.iov_base = node_key(lowest); } else { - mdbx_cassert(&mn, mn.mc_top > csrc->mc_top); + cASSERT(&mn, mn.mc_top > csrc->mc_top); key.iov_len = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); } - mdbx_cassert(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); - mdbx_cassert(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); + cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); + cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); const size_t dst_room = page_room(pdst); const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); @@ -20959,7 +21149,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } /* Mark dst as dirty. */ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; unsigned i = 0; @@ -20968,10 +21158,10 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val data; data.iov_len = node_ds(srcnode); data.iov_base = node_data(srcnode); - rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); + rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); } else { - mdbx_cassert(csrc, node_flags(srcnode) == 0); - rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode)); + cASSERT(csrc, node_flags(srcnode) == 0); + rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20985,20 +21175,20 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", - pdst->mp_pgno, page_numkeys(pdst), - page_fill(cdst->mc_txn->mt_env, pdst)); + DEBUG("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", + pdst->mp_pgno, page_numkeys(pdst), + page_fill(cdst->mc_txn->mt_env, pdst)); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); } /* Unlink the src page from parent and add to free list. 
*/ csrc->mc_top--; - mdbx_node_del(csrc, 0); + node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); if (unlikely(rc)) { csrc->mc_top++; return rc; @@ -21006,8 +21196,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } csrc->mc_top++; - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); { /* Adjust other cursors pointing to mp */ @@ -21021,7 +21211,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { continue; if (m3->mc_pg[top] == psrc) { m3->mc_pg[top] = pdst; - mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); + cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); m3->mc_ki[top] += (indx_t)dst_nkeys; m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && @@ -21035,26 +21225,26 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { /* If not operating on GC, allow this page to be reused * in this txn. Otherwise just add to free list. */ - rc = mdbx_page_retire(csrc, (MDBX_page *)psrc); + rc = page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_top > 0); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; const unsigned save_snum = cdst->mc_snum; const uint16_t save_depth = cdst->mc_db->md_depth; - mdbx_cursor_pop(cdst); - rc = mdbx_rebalance(cdst); + cursor_pop(cdst); + rc = rebalance(cdst); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); #if MDBX_ENABLE_PGOP_STAT cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1; @@ -21062,23 +21252,23 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { /* LY: don't touch cursor if top-page is a LEAF */ - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } - mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); - if (unlikely(pagetype != PAGETYPE(top_page))) { + if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ goto bailout; } if (top_page == cdst->mc_pg[cdst->mc_top]) { /* LY: don't touch cursor if prev top-page already on the top */ - mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + 
cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -21089,14 +21279,14 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } if (top_page == cdst->mc_pg[new_snum - 1]) { - mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx); + cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx); /* LY: restore cursor stack */ - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cdst->mc_snum = (uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -21112,12 +21302,12 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_ki[new_snum - 1] = top_indx; cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cdst->mc_snum = (uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -21128,14 +21318,15 @@ bailout: } static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(cdst, cdst->mc_dbi == csrc->mc_dbi); - mdbx_cassert(cdst, cdst->mc_txn == csrc->mc_txn); - mdbx_cassert(cdst, cdst->mc_db == csrc->mc_db); - mdbx_cassert(cdst, cdst->mc_dbx == csrc->mc_dbx); - mdbx_cassert(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi); + cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); + cASSERT(cdst, cdst->mc_db == csrc->mc_db); + cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); + cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; + cdst->mc_checking = csrc->mc_checking; for (unsigned i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -21147,8 +21338,8 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= - csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); + cASSERT(csrc, csrc->mc_txn->mt_txnid >= + csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); cdst->mc_dbi = csrc->mc_dbi; cdst->mc_next = NULL; cdst->mc_backup = NULL; @@ -21163,12 +21354,12 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Rebalance the tree after a delete operation. * [in] mc Cursor pointing to the page where rebalancing should begin. 
* Returns 0 on success, non-zero on failure. */ -static int mdbx_rebalance(MDBX_cursor *mc) { - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, mc->mc_snum > 0); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]); +static int rebalance(MDBX_cursor *mc) { + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, mc->mc_snum > 0); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); const unsigned minkeys = (pagetype & P_BRANCH) + 1; @@ -21181,23 +21372,22 @@ static int mdbx_rebalance(MDBX_cursor *mc) { const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; const unsigned numkeys = page_numkeys(tp); const unsigned room = page_room(tp); - mdbx_debug("rebalancing %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + DEBUG("rebalancing %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); if (unlikely(numkeys < minkeys)) { - mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold", - tp->mp_pgno, minkeys); + DEBUG("page %" PRIaPGNO " must be merged due keys < %u threshold", + tp->mp_pgno, minkeys); } else if (unlikely(room > room_threshold)) { - mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold", - tp->mp_pgno, room, room_threshold); + DEBUG("page %" PRIaPGNO " should be merged due room %u > %u threshold", + tp->mp_pgno, room, room_threshold); } else { - mdbx_debug("no need to rebalance page %" PRIaPGNO - ", room %u < %u threshold", - tp->mp_pgno, room, room_threshold); - mdbx_cassert(mc, mc->mc_db->md_entries > 0); + DEBUG("no need to rebalance page %" PRIaPGNO ", room %u < %u threshold", + tp->mp_pgno, room, room_threshold); + cASSERT(mc, mc->mc_db->md_entries > 0); return MDBX_SUCCESS; } @@ -21205,21 +21395,21 @@ static int mdbx_rebalance(MDBX_cursor *mc) { if (mc->mc_snum < 2) { MDBX_page *const mp = mc->mc_pg[0]; const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); + cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); if (IS_SUBP(mp)) { - mdbx_debug("%s", "Can't rebalance a subpage, ignoring"); - mdbx_cassert(mc, pagetype & P_LEAF); + DEBUG("%s", "Can't rebalance a subpage, ignoring"); + cASSERT(mc, pagetype & P_LEAF); return MDBX_SUCCESS; } if (nkeys == 0) { - mdbx_cassert(mc, IS_LEAF(mp)); - mdbx_debug("%s", "tree is completely empty"); - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); + cASSERT(mc, IS_LEAF(mp)); + DEBUG("%s", "tree is completely empty"); + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; - mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 && - mc->mc_db->md_overflow_pages == 0 && - mc->mc_db->md_leaf_pages == 1); + cASSERT(mc, mc->mc_db->md_branch_pages == 0 && + mc->mc_db->md_overflow_pages == 0 && + mc->mc_db->md_leaf_pages == 1); /* Adjust cursors pointing to mp */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { @@ -21237,14 +21427,13 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_top 
= 0; mc->mc_flags &= ~C_INITIALIZED; - rc = mdbx_page_retire(mc, mp); + rc = page_retire(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (IS_BRANCH(mp) && nkeys == 1) { - mdbx_debug("%s", "collapsing root page!"); + DEBUG("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); - rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], - pp_txnid4chk(mp, mc->mc_txn)); + rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_db->md_depth--; @@ -21270,18 +21459,17 @@ static int mdbx_rebalance(MDBX_cursor *mc) { m3->mc_top--; } } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || + PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - rc = mdbx_page_retire(mc, mp); + rc = page_retire(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else { - mdbx_debug("root page %" PRIaPGNO - " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); } return MDBX_SUCCESS; } @@ -21289,9 +21477,9 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* The parent (branch page) must have at least 2 pointers, * otherwise the tree is invalid. */ const unsigned pre_top = mc->mc_top - 1; - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top])); - mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0])); - mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); + cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); + cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); + cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); /* Leaf page fill factor is below the threshold. 
* Try to move keys from left or right neighbor, or @@ -21303,22 +21491,22 @@ static int mdbx_rebalance(MDBX_cursor *mc) { MDBX_page *left = nullptr, *right = nullptr; if (mn.mc_ki[pre_top] > 0) { - rc = mdbx_page_get( + rc = page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), - &left, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + &left, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { - rc = mdbx_page_get( + rc = page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), - &right, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - mdbx_cassert(mc, left || right); + cASSERT(mc, left || right); const unsigned ki_top = mc->mc_ki[mc->mc_top]; const unsigned ki_pre_top = mn.mc_ki[pre_top]; @@ -21331,33 +21519,33 @@ static int mdbx_rebalance(MDBX_cursor *mc) { retry: if (left_room > room_threshold && left_room >= right_room) { /* try merge with left */ - mdbx_cassert(mc, left_nkeys >= minkeys); + cASSERT(mc, left_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; const unsigned new_ki = ki_top + left_nkeys; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { cursor_restore(&mn, mc); mc->mc_ki[mc->mc_top] = (indx_t)new_ki; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (right_room > room_threshold) { /* try merge with right */ - mdbx_cassert(mc, right_nkeys >= minkeys); + cASSERT(mc, right_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); + WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } @@ -21369,10 +21557,10 @@ retry: mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; - WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } @@ -21382,93 +21570,166 @@ retry: mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc 
= mdbx_node_move(&mn, mc, false)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (nkeys >= minkeys) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - if (!mdbx_audit_enabled()) - return MDBX_SUCCESS; - return mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) + return cursor_check_updating(mc); + return MDBX_SUCCESS; } if (likely(room_threshold > 0)) { room_threshold = 0; goto retry; } - mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + ERROR("Unable to merge/rebalance %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); return MDBX_PROBLEM; } -__cold static int mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, unsigned options) { +__cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { DKBUF; - options |= mc->mc_flags; - MDBX_env *const env = mc->mc_txn->mt_env; - const unsigned nkeys = page_numkeys(mp); - char *const end_of_page = (char *)mp + env->me_psize; + int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) - return bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); - if (IS_OVERFLOW(mp)) { - if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) - return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); - if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) - return bad_page(mp, "overflow page beyond (%u) next-pgno\n", - mp->mp_pgno + mp->mp_pages); - if (unlikely((options & (C_SUB | C_COPYING)) == C_SUB)) - return bad_page(mp, - "unexpected overflow-page for dupsort db (flags 0x%x)\n", - mc->mc_db->md_flags); - return MDBX_SUCCESS; + rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); + + MDBX_env *const env = mc->mc_txn->mt_env; + const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb; + unsigned flags_mask = P_ILL_BITS; + unsigned flags_expected = 0; + if (offset < 0 || + offset > (ptrdiff_t)(env->me_dxb_mmap.current - ((mp->mp_flags & P_SUBP) + ? PAGEHDRSZ + 1 + : env->me_psize))) { + /* should be dirty page without MDBX_WRITEMAP, or a subpage of. 
*/ + flags_mask -= P_SUBP; + if ((env->me_flags & MDBX_WRITEMAP) != 0 || + (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP))) + rc = bad_page(mp, "invalid page-address %p, offset %zi\n", + __Wpedantic_format_voidptr(mp), offset); + } else if (offset & (env->me_psize - 1)) + flags_expected = P_SUBP; + + if (unlikely((mp->mp_flags & flags_mask) != flags_expected)) + rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", + mp->mp_flags & flags_mask, flags_expected); + + cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); + const uint8_t type = PAGETYPE_WHOLE(mp); + switch (type) { + default: + return bad_page(mp, "invalid type (%u)\n", type); + case P_OVERFLOW: + if (unlikely(mc->mc_flags & C_SUB)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", + "nested dupsort tree", mc->mc_db->md_flags); + const pgno_t npages = mp->mp_pages; + if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) + rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) + rc = bad_page( + mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno); + return rc; //-------------------------- end of large/overflow page handling + case P_LEAF | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF: + if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) + rc = bad_page( + mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", + mc->mc_db->md_flags); + break; + case P_LEAF | P_LEAF2 | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF | P_LEAF2: + if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) + rc = bad_page( + mp, + "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n", + mc->mc_db->md_flags); + break; + case P_BRANCH: + break; } - int rc = MDBX_SUCCESS; - if ((options & C_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { - if (unlikely(nkeys < 2 && IS_BRANCH(mp))) - rc = bad_page(mp, "branch-page nkey (%u) < 2\n", nkeys); + if (unlikely(mp->mp_upper < mp->mp_lower || + ((mp->mp_lower | mp->mp_upper) & 1) || + PAGEHDRSZ + mp->mp_upper > env->me_psize)) + rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %u\n", + mp->mp_lower, mp->mp_upper, page_space(env)); + + char *const end_of_page = (char *)mp + env->me_psize; + const unsigned nkeys = page_numkeys(mp); + STATIC_ASSERT(P_BRANCH == 1); + if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { + if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && + (!(mc->mc_checking & CC_UPDATING) || + !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) + rc = + bad_page(mp, "%s-page nkeys (%u) < %u\n", + IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + } + if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + + nkeys * sizeof(MDBX_node) + nkeys - 1 > + env->me_psize)) + rc = bad_page(mp, "invalid page upper (%u) for nkeys %u with limit %u\n", + mp->mp_upper, nkeys, page_space(env)); + + const size_t ksize_max = keysize_max(env->me_psize, 0); + const size_t leaf2_ksize = mp->mp_leaf2_ksize; + if (IS_LEAF2(mp)) { + if (unlikely((mc->mc_flags & C_SUB) == 0 || + (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) + rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", + mc->mc_db->md_flags); + if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) + rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); } - if (IS_LEAF2(mp) && unlikely((options & (C_SUB | C_COPYING)) == 0)) - rc = bad_page(mp, "unexpected leaf2-page (db flags 0x%x)\n", - mc->mc_db->md_flags); MDBX_val here, prev = {0, 0}; for (unsigned i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { - const size_t ksize = mp->mp_leaf2_ksize; - char *const key = page_leaf2key(mp, i, ksize); - if (unlikely(end_of_page < key + ksize)) { + char *const key = page_leaf2key(mp, i, leaf2_ksize); + if (unlikely(end_of_page < key + leaf2_ksize)) { rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", - key + ksize - end_of_page); + key + leaf2_ksize - end_of_page); continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(ksize != mc->mc_dbx->md_klen_min)) { - if (unlikely(ksize < mc->mc_dbx->md_klen_min || - ksize > mc->mc_dbx->md_klen_max)) - rc = bad_page( - mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", - ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - else - mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = ksize; - } - if ((options & C_SKIPORD) == 0) { - here.iov_len = ksize; - here.iov_base = key; - if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, - DKEY(&prev), DVAL(&here)); - prev = here; - } + if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { + if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || + leaf2_ksize > mc->mc_dbx->md_klen_max)) + rc = bad_page( + mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); + else + mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_len = leaf2_ksize; + here.iov_base = key; + if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; } } else { const MDBX_node *const node = page_node(mp, i); @@ -21478,20 +21739,22 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, node_end - end_of_page); continue; } - size_t ksize = node_ks(node); + const size_t ksize = node_ks(node); + if (unlikely(ksize > ksize_max)) + rc = bad_page(mp, "node[%u] too long key (%zu)\n", i, ksize); char *key = node_key(node); if (unlikely(end_of_page < key + ksize)) { rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); continue; } - if ((IS_LEAF(mp) || i > 0) && (options & C_COPYING) == 0) { + if ((IS_LEAF(mp) || i > 0)) { if (unlikely(ksize < mc->mc_dbx->md_klen_min || ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - if ((options & C_SKIPORD) == 0) { + if 
((mc->mc_checking & CC_SKIPORD) == 0) { here.iov_base = key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) @@ -21501,14 +21764,16 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, } } if (IS_BRANCH(mp)) { - if ((options & C_UPDATING) == 0 && i == 0 && unlikely(ksize != 0)) + if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && + unlikely(ksize != 0)) rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", i, ksize); - if ((options & C_RETIRING) == 0) { - const pgno_t ref = node_pgno(node); - if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno)) - rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); - } + const pgno_t ref = node_pgno(node); + if (unlikely(ref < MIN_PAGENO) || + (unlikely(ref >= mc->mc_txn->mt_next_pgno) && + (unlikely(ref >= mc->mc_txn->mt_geo.now) || + !(mc->mc_checking & CC_RETIRING)))) + rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); if (unlikely(node_flags(node))) rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, node_flags(node)); @@ -21536,29 +21801,33 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) - rc = bad_page( - mp, - "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - } - if ((options & C_RETIRING) == 0) { - MDBX_page *lp; - int err = mdbx_page_get(mc, node_largedata_pgno(node), &lp, - pp_txnid4chk(mp, mc->mc_txn)); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (unlikely(!IS_OVERFLOW(lp))) { - rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n", - lp->mp_pgno); - continue; + if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) + rc = bad_page( + mp, + "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + if (unlikely(node_size_len(node_ks(node), dsize) <= + mc->mc_txn->mt_env->me_leaf_nodemax)) + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + + if ((mc->mc_checking & CC_RETIRING) == 0) { + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + rc = bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else + poor_page(lp.page, + "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } - if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) - rc = - bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", - dsize, lp->mp_pages); } continue; } @@ -21575,14 +21844,12 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, /* wrong, but already handled */ continue; case 0 /* usual */: - if ((options & C_COPYING) == 0) { - if (unlikely(dsize < mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page( - mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - continue; - } + if (unlikely(dsize < mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) { + rc = bad_page( 
+ mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + continue; } break; case F_SUBDATA /* sub-db */: @@ -21604,9 +21871,8 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } else { const MDBX_page *const sp = (MDBX_page *)data; - const char *const end_of_subpage = data + dsize; - const int nsubkeys = page_numkeys(sp); - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & + /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: case P_LEAF | P_LEAF2 | P_SUBP: break; @@ -21616,6 +21882,13 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } + const char *const end_of_subpage = data + dsize; + const int nsubkeys = page_numkeys(sp); + if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && + mc->mc_db->md_entries) + rc = bad_page(mp, "no keys on a %s-page\n", + IS_LEAF2(sp) ? "leaf2-sub" : "leaf-sub"); + MDBX_val sub_here, sub_prev = {0, 0}; for (int j = 0; j < nsubkeys; j++) { if (IS_LEAF2(sp)) { @@ -21628,29 +21901,26 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page(mp, - "nested-leaf2-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - continue; - } + if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-leaf2-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + else mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; - } - if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-leaf2-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } } else { const MDBX_node *const sub_node = page_node(sp, j); @@ -21669,25 +21939,22 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ - if ((options & C_COPYING) == 0) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-node-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, - "nested-node-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - 
- if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-node-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + "nested-node-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } if (unlikely(sub_dsize != 0)) rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", @@ -21705,19 +21972,21 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, return rc; } -__cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { - mdbx_cassert(mc, - mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - (mc->mc_txn->mt_parent - ? mc->mc_txn->mt_parent->tw.dirtyroom - : mc->mc_txn->mt_env->me_options.dp_limit)); - mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || (options & C_UPDATING)); - if (unlikely(mc->mc_top != mc->mc_snum - 1) && (options & C_UPDATING) == 0) +__cold static int cursor_check(MDBX_cursor *mc) { + cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == + (mc->mc_txn->mt_parent + ? mc->mc_txn->mt_parent->tw.dirtyroom + : mc->mc_txn->mt_env->me_options.dp_limit)); + cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); + if (unlikely(mc->mc_top != mc->mc_snum - 1) && + (mc->mc_checking & CC_UPDATING) == 0) return MDBX_CURSOR_FULL; - mdbx_cassert(mc, (options & C_UPDATING) ? mc->mc_snum <= mc->mc_db->md_depth - : mc->mc_snum == mc->mc_db->md_depth); - if (unlikely((options & C_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth - : mc->mc_snum != mc->mc_db->md_depth)) + cASSERT(mc, (mc->mc_checking & CC_UPDATING) + ? mc->mc_snum <= mc->mc_db->md_depth + : mc->mc_snum == mc->mc_db->md_depth); + if (unlikely((mc->mc_checking & CC_UPDATING) + ? mc->mc_snum > mc->mc_db->md_depth + : mc->mc_snum != mc->mc_db->md_depth)) return MDBX_CURSOR_FULL; for (int n = 0; n < (int)mc->mc_snum; ++n) { @@ -21727,44 +21996,43 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { const bool expect_nested_leaf = (n + 1 == mc->mc_db->md_depth - 1) ? true : false; const bool branch = IS_BRANCH(mp) ? 
true : false; - mdbx_cassert(mc, branch == expect_branch); + cASSERT(mc, branch == expect_branch); if (unlikely(branch != expect_branch)) return MDBX_CURSOR_FULL; - if ((options & C_UPDATING) == 0) { - mdbx_cassert(mc, - nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0)); + if ((mc->mc_checking & CC_UPDATING) == 0) { + cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0)); if (unlikely(nkeys <= mc->mc_ki[n] && !(!branch && nkeys == mc->mc_ki[n] && (mc->mc_flags & C_EOF) != 0))) return MDBX_CURSOR_FULL; } else { - mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]); + cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]); if (unlikely(nkeys + 1 < mc->mc_ki[n])) return MDBX_CURSOR_FULL; } - int err = mdbx_page_check(mc, mp, options); + int err = page_check(mc, mp); if (unlikely(err != MDBX_SUCCESS)) return err; for (unsigned i = 0; i < nkeys; ++i) { if (branch) { MDBX_node *node = page_node(mp, i); - mdbx_cassert(mc, node_flags(node) == 0); + cASSERT(mc, node_flags(node) == 0); if (unlikely(node_flags(node) != 0)) return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; - int rc = mdbx_page_get(mc, pgno, &np, pp_txnid4chk(mp, mc->mc_txn)); - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = page_get(mc, pgno, &np, mp->mp_txnid); + cASSERT(mc, err == MDBX_SUCCESS); + if (unlikely(err != MDBX_SUCCESS)) + return err; const bool nested_leaf = IS_LEAF(np) ? true : false; - mdbx_cassert(mc, nested_leaf == expect_nested_leaf); + cASSERT(mc, nested_leaf == expect_nested_leaf); if (unlikely(nested_leaf != expect_nested_leaf)) return MDBX_CURSOR_FULL; - err = mdbx_page_check(mc, np, options); + err = page_check(mc, np); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -21773,19 +22041,27 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_SUCCESS; } +__cold static int cursor_check_updating(MDBX_cursor *mc) { + const uint8_t checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int rc = cursor_check(mc); + mc->mc_checking = checking; + return rc; +} + /* Complete a delete operation started by mdbx_cursor_del(). */ -static int mdbx_cursor_del0(MDBX_cursor *mc) { +static int cursor_del(MDBX_cursor *mc) { int rc; MDBX_page *mp; indx_t ki; unsigned nkeys; MDBX_dbi dbi = mc->mc_dbi; - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; - mdbx_node_del(mc, mc->mc_db->md_xsize); + node_del(mc, mc->mc_db->md_xsize); mc->mc_db->md_entries--; /* Adjust other cursors pointing to mp */ @@ -21811,27 +22087,27 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } } - rc = mdbx_rebalance(mc); + rc = rebalance(mc); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (unlikely(!mc->mc_snum)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done - * by mdbx_rebalance and aren't needed here. */ - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); + * by rebalance and aren't needed here. 
*/ + cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); mc->mc_flags |= C_EOF; return MDBX_SUCCESS; } ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); + cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); /* Adjust this and other cursors pointing to mp */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -21843,7 +22119,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + rc = cursor_sibling(m3, SIBLING_RIGHT); if (rc == MDBX_NOTFOUND) { m3->mc_flags |= C_EOF; rc = MDBX_SUCCESS; @@ -21866,10 +22142,10 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { if (!(node_flags(node) & F_SUBDATA)) m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -21881,9 +22157,9 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } } - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); return rc; bailout: @@ -21906,21 +22182,21 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - return mdbx_del0(txn, dbi, key, data, 0); + return delete (txn, dbi, key, data, 0); } -static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { +static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { MDBX_cursor_couple cx; MDBX_cursor_op op; MDBX_val rdata; int rc; DKBUF_DEBUG; - mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), + DVAL_DEBUG(data)); - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -21932,7 +22208,7 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, op = MDBX_SET; flags |= MDBX_ALLDUPS; } - rc = mdbx_cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; if (likely(rc == MDBX_SUCCESS)) { /* let mdbx_page_split know about this cursor if needed: * delete will trigger a rebalance; if it needs to move @@ -21957,11 +22233,11 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * [in] newkey The key for the newly inserted node. * [in] newdata The data for the newly inserted node. * [in] newpgno The page number, if the new node is a branch node. 
- * [in] nflags The NODE_ADD_FLAGS for the new node. + * [in] naf The NODE_ADD_FLAGS for the new node. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, - MDBX_val *const newdata, pgno_t newpgno, - unsigned nflags) { +static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, + MDBX_val *const newdata, pgno_t newpgno, + const unsigned naf) { unsigned flags; int rc = MDBX_SUCCESS, foliage = 0; unsigned i, ptop; @@ -21973,41 +22249,41 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, MDBX_page *const mp = mc->mc_pg[mc->mc_top]; const unsigned newindx = mc->mc_ki[mc->mc_top]; unsigned nkeys = page_numkeys(mp); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } STATIC_ASSERT(P_BRANCH == 1); const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; - mdbx_debug(">> splitting %s-page %" PRIaPGNO - " and adding %zu+%zu [%s] at %i, nkeys %i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, - newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), - mc->mc_ki[mc->mc_top], nkeys); - mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); + DEBUG(">> splitting %s-page %" PRIaPGNO + " and adding %zu+%zu [%s] at %i, nkeys %i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, + newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), + mc->mc_ki[mc->mc_top], nkeys); + cASSERT(mc, nkeys + 1 >= minkeys * 2); /* Create a new sibling page. */ - struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1); + pgr_t npr = page_new(mc, mp->mp_flags); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; MDBX_page *const sister = npr.page; sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno); + DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno); /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdbx_update_key, + * height is 1. But when called from update_key, * the cursor height may be greater because it walks * up the stack while finding the branch slot to update. */ if (mc->mc_top < 1) { - npr = mdbx_page_new(mc, P_BRANCH, 1); + npr = page_new(mc, P_BRANCH); rc = npr.err; if (unlikely(rc != MDBX_SUCCESS)) goto done; MDBX_page *const pp = npr.page; /* shift current top to make room for new parent */ - mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); + cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); #if MDBX_DEBUG memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); @@ -22019,11 +22295,11 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - mdbx_debug("root split! new root = %" PRIaPGNO, pp->mp_pgno); + DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno); foliage = mc->mc_db->md_depth++; /* Add left (implicit) pointer. 
*/ - rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno); + rc = node_add_branch(mc, 0, NULL, mp->mp_pgno); if (unlikely(rc != MDBX_SUCCESS)) { /* undo the pre-push */ mc->mc_pg[0] = mc->mc_pg[1]; @@ -22035,14 +22311,14 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_snum++; mc->mc_top++; ptop = 0; - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { ptop = mc->mc_top - 1; - mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); + DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); } MDBX_cursor mn; @@ -22053,14 +22329,15 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, unsigned split_indx = (newindx < nkeys) - ? /* split at the middle */ (nkeys + 1) / 2 + ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1; + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); - mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0); + cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; - if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) { + if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ @@ -22079,7 +22356,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); } else get_key(page_node(mp, 0), &sepkey); - mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); + cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); /* Avoiding rare complex cases of split the parent page */ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) split_indx = minkeys; @@ -22091,160 +22368,163 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, const bool pure_left = split_indx == 0; if (unlikely(pure_right)) { /* newindx == split_indx == nkeys */ - mdbx_trace("no-split, but add new pure page at the %s", "right/after"); - mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); + TRACE("no-split, but add new pure page at the %s", "right/after"); + cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ - mdbx_trace("no-split, but add new pure page at the %s", "left/before"); - mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey)); + TRACE("no-split, but add new pure page at the %s", "left/before"); + cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); + TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { char *split, *ins; unsigned lsize, rsize, ksize; /* Move half of the keys to the right sibling */ - const int x = mc->mc_ki[mc->mc_top] - split_indx; + const int distance = mc->mc_ki[mc->mc_top] - split_indx; ksize = mc->mc_db->md_xsize; split = page_leaf2key(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; lsize = (nkeys - split_indx) * sizeof(indx_t); - mdbx_cassert(mc, mp->mp_lower >= lsize); + cASSERT(mc, mp->mp_lower >= lsize); mp->mp_lower -= (indx_t)lsize; - mdbx_cassert(mc, sister->mp_lower + lsize 
<= UINT16_MAX); + cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); sister->mp_lower += (indx_t)lsize; - mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); + cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); mp->mp_upper += (indx_t)(rsize - lsize); - mdbx_cassert(mc, sister->mp_upper >= rsize - lsize); + cASSERT(mc, sister->mp_upper >= rsize - lsize); sister->mp_upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base; - if (x < 0) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); + if (distance < 0) { + cASSERT(mc, ksize >= sizeof(indx_t)); ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); sepkey.iov_base = sister->mp_ptrs; memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); - mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); + cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { - memcpy(sister->mp_ptrs, split, x * ksize); - ins = page_leaf2key(sister, x, ksize); + memcpy(sister->mp_ptrs, split, distance * ksize); + ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); - memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); - mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); + memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); + cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mdbx_cassert(mc, x <= (int)UINT16_MAX); - mc->mc_ki[mc->mc_top] = (indx_t)x; + cASSERT(mc, distance <= (int)UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)distance; } - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; - rc = mdbx_cursor_check(&mn, C_UPDATING); + rc = cursor_check_updating(&mn); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { - /* Maximum free space in an empty page */ - const unsigned max_space = page_space(env); - const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) - : branch_size(env, newkey); - /* grab a page to hold a temporary copy */ - tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1); + tmp_ki_copy = page_malloc(mc->mc_txn, 1); if (unlikely(tmp_ki_copy == NULL)) { rc = MDBX_ENOMEM; goto done; } + const unsigned max_space = page_space(env); + const size_t new_size = IS_LEAF(mp) ? 
leaf_size(env, newkey, newdata)
+                                          : branch_size(env, newkey);
+
       /* prepare to insert */
-      for (unsigned j = i = 0; i < nkeys; ++i, ++j) {
-        tmp_ki_copy->mp_ptrs[j] = 0;
-        j += (i == newindx);
-        tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i];
-      }
+      for (i = 0; i < newindx; ++i)
+        tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i];
+      tmp_ki_copy->mp_ptrs[i] = (indx_t)-1;
+      while (++i <= nkeys)
+        tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1];
       tmp_ki_copy->mp_pgno = mp->mp_pgno;
       tmp_ki_copy->mp_flags = mp->mp_flags;
       tmp_ki_copy->mp_txnid = INVALID_TXNID;
       tmp_ki_copy->mp_lower = 0;
       tmp_ki_copy->mp_upper = (indx_t)max_space;
 
-      /* When items are relatively large the split point needs
-       * to be checked, because being off-by-one will make the
-       * difference between success or failure in mdbx_node_add.
+      /* The node being added may not fit into a half-page together with a
+       * numerical half of the nodes from the source page. In the worst case,
+       * the half-page receiving the new node may end up with the largest
+       * nodes from the source page, while the other half gets only the nodes
+       * with the shortest keys and empty data. Therefore, to find a suitable
+       * split boundary, the nodes must be iterated while summing their sizes.
        *
-       * It's also relevant if a page happens to be laid out
-       * such that one half of its nodes are all "small" and
-       * the other half of its nodes are "large". If the new
-       * item is also "large" and falls on the half with
-       * "large" nodes, it also may not fit.
-       *
-       * As a final tweak, if the new item goes on the last
-       * spot on the page (and thus, onto the new page), bias
-       * the split so the new page is emptier than the old page.
-       * This yields better packing during sequential inserts. */
+       * However, with a simple count-based split (ignoring the sizes of keys
+       * and data) each half-page holds roughly half as many nodes. Hence the
+       * node being added is guaranteed to fit if its size does not exceed
+       * the space "freed" by the node headers that move to the other
+       * half-page. Besides, every key occupies at least one byte, in the
+       * worst case except for one, which may be of zero size. */
 
-      if (nkeys < 32 || new_size > max_space / 16) {
-        /* Find split point */
-        int dir;
-        if (newindx <= split_indx) {
-          i = 0;
-          dir = 1;
-        } else {
-          i = nkeys;
-          dir = -1;
-        }
+      if (newindx == split_indx && nkeys >= 5) {
+        STATIC_ASSERT(P_BRANCH == 1);
+        split_indx += mp->mp_flags & P_BRANCH;
+      }
+      eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys);
+      const unsigned dim_nodes =
+          (newindx >= split_indx) ? split_indx : nkeys - split_indx;
+      const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes;
+      if (new_size >= dim_used) {
+        /* Search for best acceptable split point */
+        i = (newindx < split_indx) ? 0 : nkeys;
+        int dir = (newindx < split_indx) ?
1 : -1; size_t before = 0, after = new_size + page_used(env, mp); - int best = split_indx; - int best_offset = nkeys + 1; + unsigned best_split = split_indx; + unsigned best_shift = INT_MAX; - mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, " - "new-size %zu", - i, dir, split_indx, newindx, new_size); + TRACE("seek separator from %u, step %i, default %u, new-idx %u, " + "new-size %zu", + i, dir, split_indx, newindx, new_size); do { - mdbx_cassert(mc, i <= nkeys); + cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { MDBX_node *node = (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); size = NODESIZE + node_ks(node) + sizeof(indx_t); if (IS_LEAF(mp)) - size += F_ISSET(node_flags(node), F_BIGDATA) ? sizeof(pgno_t) - : node_ds(node); + size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) + : node_ds(node); size = EVEN(size); } before += size; after -= size; - mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i, - size, before, after, max_space); + TRACE("step %u, size %zu, before %zu, after %zu, max %u", i, size, + before, after, max_space); if (before <= max_space && after <= max_space) { - int offset = branchless_abs(split_indx - i); - if (offset >= best_offset) - break; - best_offset = offset; - best = i; + const unsigned split = i + (dir > 0); + if (split >= minkeys && split <= nkeys + 1 - minkeys) { + const unsigned shift = branchless_abs(split_indx - split); + if (shift >= best_shift) + break; + best_shift = shift; + best_split = split; + if (!best_shift) + break; + } } i += dir; } while (i < nkeys); - split_indx = best + (dir > 0); - split_indx = (split_indx <= nkeys - minkeys + 1) ? split_indx - : nkeys - minkeys + 1; - split_indx = (split_indx >= minkeys) ? split_indx : minkeys; - mdbx_trace("chosen %u", split_indx); + split_indx = best_split; + TRACE("chosen %u", split_indx); } + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); - sepkey.iov_len = newkey->iov_len; - sepkey.iov_base = newkey->iov_base; + sepkey = *newkey; if (split_indx != newindx) { MDBX_node *node = (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + @@ -22254,14 +22534,14 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } } - mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); + DEBUG("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); bool did_split_parent = false; /* Copy separator key to the parent. 
*/ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { - mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); - mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2); - mdbx_cassert(mc, !pure_left); + TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); + cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2); + cASSERT(mc, !pure_left); const int snum = mc->mc_snum; const int depth = mc->mc_db->md_depth; mn.mc_snum--; @@ -22269,12 +22549,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, did_split_parent = true; /* We want other splits to find mn when doing fixups */ WITH_CURSOR_TRACKING( - mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); + mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); if (unlikely(rc != MDBX_SUCCESS)) goto done; - mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ -22296,10 +22576,10 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); + rc = cursor_sibling(mc, SIBLING_LEFT); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { - mdbx_error("unexpected %i error going left sibling", rc); + ERROR("unexpected %i error going left sibling", rc); rc = MDBX_PROBLEM; } goto done; @@ -22308,25 +22588,24 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } else if (unlikely(pure_left)) { MDBX_page *ptop_page = mc->mc_pg[ptop]; - mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s", - ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, - DKEY(mc->mc_ki[ptop] ? newkey : NULL)); + DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", + ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, + DKEY(mc->mc_ki[ptop] ? newkey : NULL)); mc->mc_top--; - rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop], - mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); - mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] && - newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); + rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? 
newkey : NULL, + sister->mp_pgno); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && + ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey)); + DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); - mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); + cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); mc->mc_ki[ptop] = 1; - rc = mdbx_update_key(mc, &sepkey); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); - mdbx_cassert(mc, - mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); + rc = update_key(mc, &sepkey); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); mc->mc_ki[ptop] = 0; } @@ -22335,13 +22614,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, goto done; MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); - mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno && - mc->mc_pg[ptop] == ptop_page); + cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); } else { mn.mc_top--; - mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page", - mn.mc_ki[ptop]); - rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); + TRACE("add-to-parent the right-entry[%u] for new sibling-page", + mn.mc_ki[ptop]); + rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); mn.mc_top++; if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -22350,18 +22628,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (unlikely(pure_left | pure_right)) { mc->mc_pg[mc->mc_top] = sister; mc->mc_ki[mc->mc_top] = 0; - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf2(mc, 0, newkey); + cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_leaf2(mc, 0, newkey); } break; default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -22376,12 +22654,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { - mc->mc_top -= i; - mdbx_debug("update new-first on parent [%i] page %u key %s", - mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, - DKEY(newkey)); - rc = mdbx_update_key(mc, newkey); - mc->mc_top += i; + mc->mc_top -= (uint8_t)i; + DEBUG("update new-first on parent [%i] page %u key %s", + mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, + DKEY(newkey)); + rc = update_key(mc, newkey); + mc->mc_top += (uint8_t)i; if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ 
-22393,19 +22671,17 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_pg[mc->mc_top] = sister; i = split_indx; unsigned n = 0; - pgno_t pgno = 0; do { - mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, - sister->mp_pgno); + TRACE("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno); + pgno_t pgno = 0; MDBX_val *rdata = NULL; if (i == newindx) { - rkey.iov_base = newkey->iov_base; - rkey.iov_len = newkey->iov_len; + rkey = *newkey; if (IS_LEAF(mp)) rdata = newdata; else pgno = newpgno; - flags = nflags; + flags = naf; /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = (indx_t)n; } else { @@ -22422,24 +22698,24 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, flags = node_flags(node); } - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_BRANCH: { - mdbx_cassert(mc, 0 == (uint16_t)flags); + cASSERT(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ - rc = mdbx_node_add_branch(mc, n, n ? &rkey : NULL, pgno); + rc = node_add_branch(mc, n, n ? &rkey : NULL, pgno); } break; case P_LEAF: { - mdbx_cassert(mc, pgno == 0); - mdbx_cassert(mc, rdata != NULL); - rc = mdbx_node_add_leaf(mc, n, &rkey, rdata, flags); + cASSERT(mc, pgno == 0); + cASSERT(mc, rdata != NULL); + rc = node_add_leaf(mc, n, &rkey, rdata, flags); } break; /* case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, gno == 0); + cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, gno == 0); rc = mdbx_node_add_leaf2(mc, n, &rkey); } break; */ default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -22449,12 +22725,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, i = 0; n = 0; mc->mc_pg[mc->mc_top] = tmp_ki_copy; - mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno); + TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno); } } while (i != split_indx); - mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, - mc->mc_pg[mc->mc_top]->mp_pgno); + TRACE("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, + mc->mc_pg[mc->mc_top]->mp_pgno); nkeys = page_numkeys(tmp_ki_copy); for (i = 0; i < nkeys; i++) @@ -22510,18 +22786,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[k + 1] = m3->mc_ki[k]; m3->mc_pg[k + 1] = m3->mc_pg[k]; } - m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 
1 : 0; + m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum++; m3->mc_top++; } if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE)) + if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = sister; - mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys); + cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys); m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; for (i = 0; i < mc->mc_top; i++) { m3->mc_ki[i] = mn.mc_ki[i]; @@ -22536,19 +22812,19 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (XCURSOR_INITED(m3) && IS_LEAF(mp)) XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } - mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), - sister->mp_pgno, page_room(sister)); + TRACE("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), + sister->mp_pgno, page_room(sister)); done: if (tmp_ki_copy) - mdbx_dpage_free(env, tmp_ki_copy, 1); + dpage_free(env, tmp_ki_copy, 1); if (unlikely(rc != MDBX_SUCCESS)) mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; else { - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); - if (unlikely(nflags & MDBX_RESERVE)) { + if (AUDIT_ENABLED()) + rc = cursor_check_updating(mc); + if (unlikely(naf & MDBX_RESERVE)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_BIGDATA)) newdata->iov_base = node_data(node); @@ -22558,7 +22834,7 @@ done: #endif /* MDBX_ENABLE_PGOP_STAT */ } - mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc); + DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc); return rc; } @@ -22583,7 +22859,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -22598,9 +22874,9 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /* LY: allows update (explicit overwrite) only for unique keys */ MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + if (node_flags(node) & F_DUPDATA) { + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); rc = MDBX_EMULTIVAL; } } @@ -22616,14 +22892,12 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /**** COPYING *****************************************************************/ /* State needed for a double-buffering compacting copy. */ -typedef struct mdbx_copy { +typedef struct mdbx_compacting_ctx { MDBX_env *mc_env; MDBX_txn *mc_txn; - mdbx_condpair_t mc_condpair; + osal_condpair_t mc_condpair; uint8_t *mc_wbuf[2]; - uint8_t *mc_over[2]; size_t mc_wlen[2]; - size_t mc_olen[2]; mdbx_filehandle_t mc_fd; /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. 
*/ @@ -22631,39 +22905,38 @@ typedef struct mdbx_copy { pgno_t mc_next_pgno; volatile unsigned mc_head; volatile unsigned mc_tail; -} mdbx_copy; +} mdbx_compacting_ctx; /* Dedicated writer thread for compacting copy. */ -__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { - mdbx_copy *my = arg; +__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { + mdbx_compacting_ctx *const ctx = arg; #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) sigset_t sigset; sigemptyset(&sigset); sigaddset(&sigset, SIGPIPE); - my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); #endif /* EPIPE */ - mdbx_condpair_lock(&my->mc_condpair); - while (!my->mc_error) { - while (my->mc_tail == my->mc_head && !my->mc_error) { - int err = mdbx_condpair_wait(&my->mc_condpair, true); + osal_condpair_lock(&ctx->mc_condpair); + while (!ctx->mc_error) { + while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { + int err = osal_condpair_wait(&ctx->mc_condpair, true); if (err != MDBX_SUCCESS) { - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } - const unsigned toggle = my->mc_tail & 1; - size_t wsize = my->mc_wlen[toggle]; + const unsigned toggle = ctx->mc_tail & 1; + size_t wsize = ctx->mc_wlen[toggle]; if (wsize == 0) { - my->mc_tail += 1; + ctx->mc_tail += 1; break /* EOF */; } - my->mc_wlen[toggle] = 0; - uint8_t *ptr = my->mc_wbuf[toggle]; - again: - if (!my->mc_error) { - int err = mdbx_write(my->mc_fd, ptr, wsize); + ctx->mc_wlen[toggle] = 0; + uint8_t *ptr = ctx->mc_wbuf[toggle]; + if (!ctx->mc_error) { + int err = osal_write(ctx->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) if (err == EPIPE) { @@ -22673,134 +22946,158 @@ __cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { sigwait(&sigset, &unused); } #endif /* EPIPE */ - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } - - /* If there's an overflow page tail, write it too */ - wsize = my->mc_olen[toggle]; - if (wsize) { - my->mc_olen[toggle] = 0; - ptr = my->mc_over[toggle]; - goto again; - } - my->mc_tail += 1; - mdbx_condpair_signal(&my->mc_condpair, false); + ctx->mc_tail += 1; + osal_condpair_signal(&ctx->mc_condpair, false); } bailout: - mdbx_condpair_unlock(&my->mc_condpair); + osal_condpair_unlock(&ctx->mc_condpair); return (THREAD_RESULT)0; } /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. 
*/ -__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) { - mdbx_condpair_lock(&my->mc_condpair); - mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); - my->mc_head += 1; - mdbx_condpair_signal(&my->mc_condpair, true); - while (!my->mc_error && - my->mc_head - my->mc_tail == 2 /* both buffers in use */) { - int err = mdbx_condpair_wait(&my->mc_condpair, false); +__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { + osal_condpair_lock(&ctx->mc_condpair); + eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); + ctx->mc_head += 1; + osal_condpair_signal(&ctx->mc_condpair, true); + while (!ctx->mc_error && + ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { + int err = osal_condpair_wait(&ctx->mc_condpair, false); if (err != MDBX_SUCCESS) - my->mc_error = err; + ctx->mc_error = err; } - mdbx_condpair_unlock(&my->mc_condpair); - return my->mc_error; + osal_condpair_unlock(&ctx->mc_condpair); + return ctx->mc_error; } -/* Depth-first tree traversal for compacting copy. - * [in] my control structure. - * [in,out] pg database root. - * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ -__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { - MDBX_cursor_couple couple; - MDBX_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc; - unsigned i; +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDBX_SUCCESS; +static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, + size_t bytes, pgno_t pgno, pgno_t npages) { + assert(pgno == 0 || bytes > PAGEHDRSZ); + while (bytes > 0) { + const unsigned side = ctx->mc_head & 1; + const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; + if (left < (pgno ? PAGEHDRSZ : 1)) { + int err = compacting_toggle_write_buffers(ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + continue; + } + const size_t chunk = (bytes < left) ? 
bytes : left; + void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side]; + if (src) { + memcpy(dst, src, chunk); + if (pgno) { + assert(chunk > PAGEHDRSZ); + MDBX_page *mp = dst; + mp->mp_pgno = pgno; + if (mp->mp_txnid == 0) + mp->mp_txnid = ctx->mc_txn->mt_txnid; + if (mp->mp_flags == P_OVERFLOW) { + assert(bytes <= pgno2bytes(ctx->mc_env, npages)); + mp->mp_pages = npages; + } + pgno = 0; + } + src = (const char *)src + chunk; + } else + memset(dst, 0, chunk); + bytes -= chunk; + ctx->mc_wlen[side] += chunk; + } + return MDBX_SUCCESS; +} - memset(&couple, 0, sizeof(couple)); - couple.outer.mc_snum = 1; - couple.outer.mc_txn = my->mc_txn; - couple.outer.mc_flags = couple.inner.mx_cursor.mc_flags = - C_COPYING | C_SKIPORD; +static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, + const size_t head_bytes, const size_t tail_bytes, + const pgno_t npages) { + if (tail_bytes) { + assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); + assert(npages == 1 && + (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF)); + } else { + assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); + assert((npages == 1 && PAGETYPE_WHOLE(mp) == (P_LEAF | P_LEAF2)) || + PAGETYPE_WHOLE(mp) == P_OVERFLOW); + } - rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], - my->mc_txn->mt_txnid); + const pgno_t pgno = ctx->mc_next_pgno; + ctx->mc_next_pgno += npages; + int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = compacting_put_bytes( + ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes), + 0, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return compacting_put_bytes( + ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0, + 0); +} + +__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, + MDBX_cursor *mc, pgno_t *root, + txnid_t parent_txnid) { + mc->mc_snum = 1; + int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST); + + rc = page_search_root(mc, nullptr, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ - buf = ptr = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); + char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; - for (i = 0; i < couple.outer.mc_top; i++) { - mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i], - my->mc_env->me_psize); - couple.outer.mc_pg[i] = (MDBX_page *)ptr; - ptr += my->mc_env->me_psize; + char *ptr = buf; + for (unsigned i = 0; i < mc->mc_top; i++) { + page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + mc->mc_pg[i] = (MDBX_page *)ptr; + ptr += ctx->mc_env->me_psize; } - /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDBX_page *)ptr; + MDBX_page *const leaf = (MDBX_page *)ptr; - while (couple.outer.mc_snum > 0) { - mp = couple.outer.mc_pg[couple.outer.mc_top]; + while (mc->mc_snum > 0) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i = 0; i < n; i++) { + if (!(mc->mc_flags & + C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { + for (unsigned i = 0; i < n; i++) { MDBX_node *node = page_node(mp, i); - if (node_flags(node) & F_BIGDATA) { - MDBX_page *omp; - + if (node_flags(node) == F_BIGDATA) { /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } - const pgno_t pgno = node_largedata_pgno(node); - poke_pgno(node_data(node), my->mc_next_pgno); - rc = mdbx_page_get(&couple.outer, pgno, &omp, - pp_txnid4chk(mp, my->mc_txn)); + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) != MDBX_SUCCESS)) + goto done; + const size_t datasize = node_ds(node); + const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize); + poke_pgno(node_data(node), ctx->mc_next_pgno); + rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, + npages); if (unlikely(rc != MDBX_SUCCESS)) goto done; - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_env->me_psize > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; - } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); - my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; - } } else if (node_flags(node) & F_SUBDATA) { - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; goto done; @@ -22808,75 +23105,118 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } - MDBX_db db; - memcpy(&db, node_data(node), sizeof(MDBX_db)); - rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); - if (rc) + MDBX_db *nested = nullptr; + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); + if (likely(rc == MDBX_SUCCESS)) { + nested = &mc->mc_xcursor->mx_db; + rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, + &nested->md_root, mp->mp_txnid); + } + } else { + cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); + MDBX_cursor_couple *couple = + container_of(mc, MDBX_cursor_couple, outer); + cASSERT(mc, + couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE && + !couple->inner.mx_cursor.mc_flags && + !couple->inner.mx_cursor.mc_db && + !couple->inner.mx_cursor.mc_dbx); + nested = &couple->inner.mx_db; + 
memcpy(nested, node_data(node), sizeof(MDBX_db)); + rc = compacting_walk_sdb(ctx, nested); + } + if (unlikely(rc != MDBX_SUCCESS)) goto done; - memcpy(node_data(node), &db, sizeof(MDBX_db)); + memcpy(node_data(node), nested, sizeof(MDBX_db)); } } } } else { - couple.outer.mc_ki[couple.outer.mc_top]++; - if (couple.outer.mc_ki[couple.outer.mc_top] < n) { - again: - rc = mdbx_page_get( - &couple.outer, - node_pgno(page_node(mp, couple.outer.mc_ki[couple.outer.mc_top])), - &mp, pp_txnid4chk(mp, my->mc_txn)); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - couple.outer.mc_top++; - couple.outer.mc_snum++; - couple.outer.mc_ki[couple.outer.mc_top] = 0; - if (IS_BRANCH(mp)) { + mc->mc_ki[mc->mc_top]++; + if (mc->mc_ki[mc->mc_top] < n) { + while (1) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + mc->mc_top++; + mc->mc_snum++; + mc->mc_ki[mc->mc_top] = 0; + if (!IS_BRANCH(mp)) { + mc->mc_pg[mc->mc_top] = mp; + break; + } /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. */ - mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp, - my->mc_env->me_psize); - goto again; - } else - couple.outer.mc_pg[couple.outer.mc_top] = mp; + page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); + } continue; } } - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_wlen[toggle] > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; + + const pgno_t pgno = ctx->mc_next_pgno; + if (likely(!IS_LEAF2(mp))) { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + mp->mp_lower, + ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1); + } else { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1); } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdbx_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (couple.outer.mc_top) { + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + + if (mc->mc_top) { /* Update parent if there is one */ - node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1], - couple.outer.mc_ki[couple.outer.mc_top - 1]), - mo->mp_pgno); - mdbx_cursor_pop(&couple.outer); + node_set_pgno( + page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), + pgno); + cursor_pop(mc); } else { /* Otherwise we're done */ - *pg = mo->mp_pgno; + *root = pgno; break; } } done: - mdbx_free(buf); + osal_free(buf); return rc; } -__cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { + if (unlikely(sdb->md_root == P_INVALID)) + return MDBX_SUCCESS; /* empty db */ + + MDBX_cursor_couple couple; + memset(&couple, 0, sizeof(couple)); + couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; + MDBX_dbx dbx = {.md_klen_min = INT_MAX}; + uint8_t dbistate = DBI_VALID | DBI_AUDITED; + int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + if (!sdb->md_mod_txnid) + sdb->md_mod_txnid = ctx->mc_txn->mt_txnid; + return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, + sdb->md_mod_txnid); +} + 
+__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { + eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || + meta->mm_dbs[FREE_DBI].md_root == P_INVALID); + eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || + meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); + /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; @@ -22896,11 +23236,11 @@ __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Update signature */ assert(meta->mm_geo.now >= meta->mm_geo.next); - unaligned_poke_u64(4, meta->mm_datasync_sign, meta_sign(meta)); + unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); } /* Make resizeable */ -__cold static void make_sizeable(MDBX_meta *meta) { +__cold static void meta_make_sizeable(MDBX_meta *meta) { meta->mm_geo.lower = MIN_PAGENO; if (meta->mm_geo.grow_pv == 0) { const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42; @@ -22913,90 +23253,120 @@ __cold static void make_sizeable(MDBX_meta *meta) { } /* Copy environment with compaction. */ -__cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); - MDBX_meta *const meta = mdbx_init_metas(env, buffer); + MDBX_meta *const meta = init_metas(env, buffer); meta_set_txnid(env, meta, read_txn->mt_txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_sizeable(meta); + meta_make_sizeable(meta); /* copy canary sequences if present */ if (read_txn->mt_canary.v) { meta->mm_canary = read_txn->mt_canary; - meta->mm_canary.v = constmeta_txnid(env, meta); + meta->mm_canary.v = constmeta_txnid(meta); } - /* Set metapage 1 with current main DB */ - pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root; - if ((new_root = root) == P_INVALID) { + if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. */ meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; - compact_fixup_meta(env, meta); + compacting_fixup_meta(env, meta); if (dest_is_pipe) { - int rc = mdbx_write(fd, buffer, meta_bytes); - if (rc != MDBX_SUCCESS) + int rc = osal_write(fd, buffer, meta_bytes); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else { - /* Count free pages + GC pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. */ - pgno_t freecount = 0; + /* Count free pages + GC pages. 
*/ MDBX_cursor_couple couple; - MDBX_val key, data; - - int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI); + int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0) - freecount += *(pgno_t *)data.iov_base; + pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + + read_txn->mt_dbs[FREE_DBI].md_leaf_pages + + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; + MDBX_val key, data; + while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == + MDBX_SUCCESS) { + const MDBX_PNL pnl = data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(pnl) || + !(pnl_check(pnl, read_txn->mt_next_pgno)))) + return MDBX_CORRUPTED; + gc += MDBX_PNL_SIZE(pnl); + } if (unlikely(rc != MDBX_NOTFOUND)) return rc; - freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages + - read_txn->mt_dbs[FREE_DBI].md_leaf_pages + - read_txn->mt_dbs[FREE_DBI].md_overflow_pages; - - new_root = read_txn->mt_next_pgno - 1 - freecount; - meta->mm_geo.next = new_root + 1; + /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. */ + meta->mm_geo.next = read_txn->mt_next_pgno - gc; + /* Set with current main DB */ meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; - meta->mm_dbs[MAIN_DBI].md_root = new_root; - mdbx_copy ctx; + mdbx_compacting_ctx ctx; memset(&ctx, 0, sizeof(ctx)); - rc = mdbx_condpair_init(&ctx.mc_condpair); + rc = osal_condpair_init(&ctx.mc_condpair); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2); + memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); ctx.mc_wbuf[0] = data_buffer; - ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF)); + ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; ctx.mc_next_pgno = NUM_METAS; ctx.mc_env = env; ctx.mc_fd = fd; ctx.mc_txn = read_txn; - mdbx_thread_t thread; - int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); + osal_thread_t thread; + int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { - compact_fixup_meta(env, meta); - rc = mdbx_write(fd, buffer, meta_bytes); + if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) + meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; + compacting_fixup_meta(env, meta); + rc = osal_write(fd, buffer, meta_bytes); } - if (rc == MDBX_SUCCESS) - rc = mdbx_env_cwalk(&ctx, &root, 0); - mdbx_env_cthr_toggle(&ctx); - mdbx_env_cthr_toggle(&ctx); - thread_err = mdbx_thread_join(thread); - mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && - ctx.mc_wlen[ctx.mc_head & 1] == 0) || - ctx.mc_error); - mdbx_condpair_destroy(&ctx.mc_condpair); + if (likely(rc == MDBX_SUCCESS)) + rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); + if (ctx.mc_wlen[ctx.mc_head & 1]) + /* toggle to flush non-empty buffers */ + compacting_toggle_write_buffers(&ctx); + + if (likely(rc == MDBX_SUCCESS) && + unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { + if (ctx.mc_next_pgno > meta->mm_geo.next) { + ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has double-used pages or other corruption", ctx.mc_next_pgno, + '>', meta->mm_geo.next); + rc = MDBX_CORRUPTED; /* corrupted DB */ + } + if (ctx.mc_next_pgno < meta->mm_geo.next) { + WARNING( + "the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has 
page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next); + if (dest_is_pipe) + /* the root within already written meta-pages is wrong */ + rc = MDBX_CORRUPTED; + } + /* fixup meta */ + meta->mm_geo.next = ctx.mc_next_pgno; + } + + /* toggle with empty buffers to exit thread's loop */ + eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); + compacting_toggle_write_buffers(&ctx); + thread_err = osal_thread_join(thread); + eASSERT(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); + osal_condpair_destroy(&ctx.mc_condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) return thread_err; @@ -23004,49 +23374,24 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, return rc; if (unlikely(ctx.mc_error != MDBX_SUCCESS)) return ctx.mc_error; - - if (dest_is_pipe) { - if (unlikely(root != new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " NE expected %" PRIaPGNO - " (source DB corrupted or has a page leak(s))", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - } else { - if (unlikely(root > new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " GT expected %" PRIaPGNO " (source DB corrupted)", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - if (unlikely(root < new_root)) { - mdbx_warning("post-compactification root %" PRIaPGNO - " LT expected %" PRIaPGNO " (page leak(s) in source DB)", - root, new_root); - /* fixup meta */ - meta->mm_dbs[MAIN_DBI].md_root = root; - meta->mm_geo.next = root + 1; - } - compact_fixup_meta(env, meta); - } + if (!dest_is_pipe) + compacting_fixup_meta(env, meta); } /* Extend file if required */ if (meta->mm_geo.now != meta->mm_geo.next) { const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); if (!dest_is_pipe) - return mdbx_ftruncate(fd, whole_size); + return osal_ftruncate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->mm_geo.next); - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; offset < whole_size;) { - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : whole_size - offset; + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ - int rc = mdbx_write(fd, data_buffer, chunk); + int rc = osal_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) return rc; offset += chunk; @@ -23056,11 +23401,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* Copy environment as-is. 
*/ -__cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { /* We must start the actual read txn after blocking writers */ - int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); + int rc = txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23069,33 +23414,35 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_txn_renew0(read_txn, MDBX_TXN_RDONLY); + rc = txn_renew(read_txn, MDBX_TXN_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); return rc; } - mdbx_jitter4testing(false); + jitter4testing(false); const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + const meta_troika_t troika = meta_tap(env); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - (MDBX_meta *)(buffer + ((uint8_t *)meta_prefer_last(env) - env->me_map)); + (MDBX_meta *)(buffer + + ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_sizeable(headcopy); + meta_make_sizeable(headcopy); /* Update signature to steady */ - unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy)); + unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy)); /* Copy the data */ const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); - mdbx_jitter4testing(false); + jitter4testing(false); if (dest_is_pipe) - rc = mdbx_write(fd, buffer, meta_bytes); + rc = osal_write(fd, buffer, meta_bytes); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); @@ -23145,30 +23492,29 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, #endif /* MDBX_USE_COPYFILERANGE */ /* fallback to portable */ - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : used_size - offset; + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ memcpy(data_buffer, env->me_map + offset, chunk); - rc = mdbx_write(fd, data_buffer, chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } /* Extend file if required */ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { if (!dest_is_pipe) - rc = mdbx_ftruncate(fd, whole_size); + rc = osal_ftruncate(fd, whole_size); else { - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) { const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) + ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? 
(size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ - rc = mdbx_write(fd, data_buffer, chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } } @@ -23183,12 +23529,12 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const int dest_is_pipe = mdbx_is_pipe(fd); + const int dest_is_pipe = osal_is_pipe(fd); if (MDBX_IS_ERROR(dest_is_pipe)) return dest_is_pipe; if (!dest_is_pipe) { - rc = mdbx_fseek(fd, 0); + rc = osal_fseek(fd, 0); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -23196,12 +23542,12 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, const size_t buffer_size = pgno_align2os_bytes(env, NUM_METAS) + ceil_powerof2(((flags & MDBX_CP_COMPACT) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2 - : ((size_t)(MDBX_ENVCOPY_WRITEBUF))), + ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF + : (size_t)MDBX_ENVCOPY_WRITEBUF), env->me_os_psize); uint8_t *buffer = NULL; - rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); + rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23210,7 +23556,7 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, * write txn. Otherwise other read txns could block writers. */ rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } @@ -23218,34 +23564,44 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, /* Firstly write a stub to meta-pages. * Now we sure to incomplete copy will not be used. */ memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); } if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); - rc = ((flags & MDBX_CP_COMPACT) ? mdbx_env_compact : mdbx_env_copy_asis)( + rc = ((flags & MDBX_CP_COMPACT) ? env_compact : env_copy_asis)( env, read_txn, fd, buffer, dest_is_pipe, flags); } mdbx_txn_abort(read_txn); if (!dest_is_pipe) { if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *dest_pathW = nullptr; + OSAL_MB2WIDE(dest_path, dest_pathW); + return mdbx_env_copyW(env, dest_pathW, flags); +} + +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, + MDBX_copy_flags_t flags) { +#endif /* Windows */ + int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23257,7 +23613,7 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ mdbx_filehandle_t newfd; - rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, + rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, #if defined(_WIN32) || defined(_WIN64) (mdbx_mode_t)-1 #else @@ -23294,11 +23650,11 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, rc = mdbx_env_copy2fd(env, newfd, flags); if (newfd != INVALID_HANDLE_VALUE) { - int err = mdbx_closefile(newfd); + int err = osal_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; if (rc != MDBX_SUCCESS) - (void)mdbx_removefile(dest_path); + (void)osal_removefile(dest_path); } return rc; @@ -23321,11 +23677,11 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, return MDBX_EACCESS; if ((env->me_flags & MDBX_ENV_ACTIVE) && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) + unlikely(env->me_txn0->mt_owner == osal_thread_self())) return MDBX_BUSY; const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && - env->me_txn0->mt_owner != mdbx_thread_self(); + env->me_txn0->mt_owner != osal_thread_self(); bool should_unlock = false; if (lock_needed) { rc = mdbx_txn_lock(env, false); @@ -23383,6 +23739,7 @@ __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } +#if !(defined(_WIN32) || defined(_WIN64)) __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -23394,6 +23751,19 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { *arg = env->me_pathname; return MDBX_SUCCESS; } +#else +__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + + *arg = env->me_pathname; + return MDBX_SUCCESS; +} +#endif /* Windows */ __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env, true); @@ -23459,12 +23829,12 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { MDBX_cursor_couple cx; - err = mdbx_cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); if (unlikely(err != MDBX_SUCCESS)) return err; /* scan and account not opened named subDBs */ - err = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (err == MDBX_SUCCESS) { const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; for (unsigned i = 0; i < page_numkeys(mp); i++) { @@ -23490,7 +23860,7 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { stat_add(&db, st, bytes); } } - err = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + err = cursor_sibling(&cx.outer, SIBLING_RIGHT); } if (unlikely(err != MDBX_NOTFOUND)) return err; @@ -23517,7 +23887,7 @@ __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(err != MDBX_SUCCESS)) return err; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) /* inside write-txn */ return stat_acc(env->me_txn, dest, bytes); @@ -23546,14 +23916,14 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if 
(unlikely(rc != MDBX_SUCCESS)) return rc; if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) return MDBX_RESULT_TRUE; MDBX_val key, data; - rc = mdbx_cursor_first(&cx.outer, &key, &data); + rc = cursor_first(&cx.outer, &key, &data); *mask = 0; while (rc == MDBX_SUCCESS) { const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], @@ -23575,10 +23945,10 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); break; default: - mdbx_error("wrong node-flags %u", flags); + ERROR("wrong node-flags %u", flags); return MDBX_CORRUPTED; } - rc = mdbx_cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); } return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; @@ -23629,21 +23999,30 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); - arg->mi_recent_txnid = meta_txnid(env, recent_meta); - arg->mi_meta0_txnid = meta_txnid(env, meta0); - arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign); - arg->mi_meta1_txnid = meta_txnid(env, meta1); - arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign); - arg->mi_meta2_txnid = meta_txnid(env, meta2); - arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign); + meta_troika_t holder; + meta_troika_t const *troika; + if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) + troika = &txn->tw.troika; + else { + holder = meta_tap(env); + troika = &holder; + } + + const meta_ptr_t head = meta_recent(env, troika); + arg->mi_recent_txnid = head.txnid; + arg->mi_meta0_txnid = troika->txnid[0]; + arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); + arg->mi_meta1_txnid = troika->txnid[1]; + arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); + arg->mi_meta2_txnid = troika->txnid[2]; + arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > size_before_bootid)) { memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); } - volatile const MDBX_meta *txn_meta = recent_meta; + const volatile MDBX_meta *txn_meta = head.ptr_v; arg->mi_last_pgno = txn_meta->mm_geo.next - 1; arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); if (txn) { @@ -23678,16 +24057,16 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (likely(bytes > size_before_bootid)) { arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); - const uint64_t monotime_now = mdbx_osal_monotime(); + const uint64_t monotime_now = osal_monotime(); uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); arg->mi_since_reader_check_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? 
osal_monotime_to_16dot16(monotime_now - ts) : 0; arg->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + arg->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16( atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; @@ -23711,6 +24090,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( + atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ @@ -23732,7 +24113,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, } } - mdbx_compiler_barrier(); + osal_compiler_barrier(); return MDBX_SUCCESS; } @@ -23794,8 +24175,8 @@ static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); } -static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { +static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { /* LY: so, accepting only three cases for the table's flags: * 1) user_flags and both comparators are zero * = assume that a by-default mode/flags is requested for reading; @@ -23883,7 +24264,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* main table? */ if (!table_name) { - rc = mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; *dbi = MAIN_DBI; @@ -23912,7 +24293,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } if (len == txn->mt_dbxs[scan].md_name.iov_len && !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; *dbi = scan; @@ -23938,10 +24319,10 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, key.iov_len = len; key.iov_base = (void *)table_name; MDBX_cursor_couple couple; - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; - rc = mdbx_cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) goto early_bailout; @@ -23953,7 +24334,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, rc = MDBX_INCOMPATIBLE; goto early_bailout; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(data.iov_len != sizeof(MDBX_db))) { + if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; goto early_bailout; } @@ -23965,16 +24346,16 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Done here so we cannot fail after creating a new DB */ - char *namedup = mdbx_strdup(table_name); + 
char *namedup = osal_strdup(table_name); if (unlikely(!namedup)) { rc = MDBX_ENOMEM; goto early_bailout; } - int err = mdbx_fastmutex_acquire(&env->me_dbi_lock); + int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - mdbx_free(namedup); + osal_free(namedup); goto early_bailout; } @@ -23990,7 +24371,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } if (len == txn->mt_dbxs[scan].md_name.iov_len && !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto later_bailout; *dbi = scan; @@ -24007,7 +24388,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_mod_txnid = txn->mt_txnid; @@ -24023,38 +24404,41 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, dbiflags |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; - mdbx_tassert(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); + tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); } /* Got info, register DBI in this txn */ memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); env->me_dbflags[slot] = 0; - rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_tassert(txn, (dbiflags & DBI_CREAT) == 0); + tASSERT(txn, (dbiflags & DBI_CREAT) == 0); later_bailout: *dbi = 0; later_exit: - mdbx_free(namedup); + osal_free(namedup); } else { txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; txn->mt_dbxs[slot].md_name.iov_len = len; - txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot] = dbi_seq(env, slot); + txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = + dbi_seq(env, slot); if (!(dbiflags & DBI_CREAT)) env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; if (txn->mt_numdbs == slot) { - mdbx_compiler_barrier(); - txn->mt_numdbs = slot + 1; txn->mt_cursors[slot] = NULL; + osal_compiler_barrier(); + txn->mt_numdbs = slot + 1; } - if (env->me_numdbs <= slot) + if (env->me_numdbs <= slot) { + osal_memory_fence(mo_AcquireRelease, true); env->me_numdbs = slot + 1; + } *dbi = slot; } - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); return rc; } @@ -24089,7 +24473,7 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_BAD_TXN; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -24099,8 +24483,8 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_SUCCESS; } -static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { - mdbx_assert(env, dbi >= CORE_DBS); +static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { + eASSERT(env, dbi >= CORE_DBS); if (unlikely(dbi >= env->me_numdbs)) return MDBX_BAD_DBI; @@ -24111,9 +24495,9 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { 
env->me_dbflags[dbi] = 0; env->me_dbxs[dbi].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); env->me_dbxs[dbi].md_name.iov_base = NULL; - mdbx_free(ptr); + osal_free(ptr); if (env->me_numdbs == dbi + 1) { unsigned i = env->me_numdbs; @@ -24134,12 +24518,12 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_BAD_DBI; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (likely(rc == MDBX_SUCCESS)) { rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) - ? mdbx_dbi_close_locked(env, dbi) + ? dbi_close_locked(env, dbi) : MDBX_BAD_DBI; - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } return rc; } @@ -24169,21 +24553,21 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ -static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { - int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); +static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { + int rc = page_search(mc, NULL, MDBX_PS_FIRST); if (likely(rc == MDBX_SUCCESS)) { MDBX_txn *txn = mc->mc_txn; /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. - * Also if the DB doesn't have sub-DBs and has no overflow + * Also if the DB doesn't have sub-DBs and has no large/overflow * pages, omit scanning leaves. */ if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) - mdbx_cursor_pop(mc); + cursor_pop(mc); - rc = mdbx_pnl_need(&txn->tw.retired_pages, - mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages + - mc->mc_db->md_overflow_pages); + rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages + + mc->mc_db->md_leaf_pages + + mc->mc_db->md_overflow_pages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -24193,11 +24577,11 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { MDBX_page *const mp = mc->mc_pg[mc->mc_top]; const unsigned nkeys = page_numkeys(mp); if (IS_LEAF(mp)) { - mdbx_cassert(mc, mc->mc_snum == mc->mc_db->md_depth); + cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); for (unsigned i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { - rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), NULL, 0); + rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) @@ -24207,45 +24591,43 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; goto bailout; } - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } else { - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth); - if (mdbx_audit_enabled()) - mc->mc_flags |= C_RETIRING; - const int pagetype = + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); + mc->mc_checking |= CC_RETIRING; + const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? 
P_LEAF : P_BRANCH); for (unsigned i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); - mdbx_tassert(txn, (node_flags(node) & - (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + tASSERT(txn, (node_flags(node) & + (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); const pgno_t pgno = node_pgno(node); - rc = mdbx_page_retire_ex(mc, pgno, NULL, pagetype); + rc = page_retire_ex(mc, pgno, nullptr, pagetype); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (mdbx_audit_enabled()) - mc->mc_flags -= C_RETIRING; + mc->mc_checking -= CC_RETIRING; } if (!mc->mc_top) break; - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + rc = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_NOTFOUND)) goto bailout; /* no more siblings, go back to beginning * of previous level. */ pop: - mdbx_cursor_pop(mc); + cursor_pop(mc); mc->mc_ki[0] = 0; for (unsigned i = 1; i < mc->mc_snum; i++) { mc->mc_ki[i] = 0; @@ -24253,7 +24635,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } } } - rc = mdbx_page_retire(mc, mc->mc_pg[0]); + rc = page_retire(mc, mc->mc_pg[0]); bailout: if (unlikely(rc != MDBX_SUCCESS)) txn->mt_flags |= MDBX_TXN_ERROR; @@ -24274,8 +24656,8 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_drop_tree(mc, dbi == MAIN_DBI || - (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); + rc = drop_tree(mc, + dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); /* Invalidate the dropped DB's cursors */ for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -24284,20 +24666,19 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { /* Can't delete the main DB */ if (del && dbi >= CORE_DBS) { - rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); - mdbx_tassert(txn, txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); txn->mt_dbistate[dbi] = DBI_STALE; MDBX_env *env = txn->mt_env; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; goto bailout; } - mdbx_dbi_close_locked(env, dbi); - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + dbi_close_locked(env, dbi); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } else { txn->mt_flags |= MDBX_TXN_ERROR; } @@ -24311,7 +24692,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { txn->mt_dbs[dbi].md_entries = 0; txn->mt_dbs[dbi].md_root = P_INVALID; txn->mt_dbs[dbi].md_seq = 0; - /* txn->mt_dbs[dbi].md_mod_txnid = txn->mt_txnid; */ txn->mt_flags |= MDBX_TXN_DIRTY; } @@ -24381,7 +24761,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) goto retry_reader; - mdbx_assert(env, txnid > 0); + eASSERT(env, txnid > 0); if (txnid >= SAFE64_INVALID_THRESHOLD) txnid = 0; @@ -24389,20 +24769,18 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, size_t bytes_retained = 0; uint64_t lag = 0; if (txnid) { + meta_troika_t troika = 
meta_tap(env); retry_header:; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &troika); const uint64_t head_pages_retired = - unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); - const txnid_t head_txnid = meta_txnid(env, recent_meta); - mdbx_compiler_barrier(); - if (unlikely(recent_meta != meta_prefer_last(env) || + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + if (unlikely(meta_should_retry(env, &troika) || head_pages_retired != unaligned_peek_u64_volatile( - 4, recent_meta->mm_pages_retired)) || - head_txnid != meta_txnid(env, recent_meta)) + 4, head.ptr_v->mm_pages_retired))) goto retry_header; - lag = (head_txnid - txnid) / xMDBX_TXNID_STEP; + lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; bytes_used = pgno2bytes(env, pages_used); bytes_retained = (head_pages_retired > reader_pages_retired) ? pgno2bytes(env, (pgno_t)(head_pages_retired - @@ -24421,7 +24799,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, /* Insert pid into list if not already present. * return -1 if already present. */ -__cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { +__cold static bool pid_insert(uint32_t *ids, uint32_t pid) { /* binary search of pid in list */ unsigned base = 0; unsigned cursor = 1; @@ -24457,20 +24835,20 @@ __cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { __cold int mdbx_reader_check(MDBX_env *env, int *dead) { if (dead) *dead = 0; - return mdbx_cleanup_dead_readers(env, false, dead); + return cleanup_dead_readers(env, false, dead); } /* Return: * MDBX_RESULT_TRUE - done and mutex recovered * MDBX_SUCCESS - done * Otherwise errcode. */ -__cold MDBX_INTERNAL_FUNC int -mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { +__cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, + int rdt_locked, int *dead) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_assert(env, rdt_locked >= 0); + eASSERT(env, rdt_locked >= 0); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL)) { /* exclusive mode */ @@ -24485,7 +24863,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) ? 
pidsbuf_onstask - : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t)); + : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); if (unlikely(!pids)) return MDBX_ENOMEM; @@ -24498,21 +24876,21 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { continue /* skip empty */; if (pid == env->me_pid) continue /* skip self */; - if (!mdbx_pid_insert(pids, pid)) + if (!pid_insert(pids, pid)) continue /* such pid already processed */; - int err = mdbx_rpid_check(env, pid); + int err = osal_rpid_check(env, pid); if (err == MDBX_RESULT_TRUE) continue /* reader is live */; if (err != MDBX_SUCCESS) { rc = err; - break /* mdbx_rpid_check() failed */; + break /* osal_rpid_check() failed */; } /* stale reader found */ if (!rdt_locked) { - err = mdbx_rdt_lock(env); + err = osal_rdt_lock(env); if (MDBX_IS_ERROR(err)) { rc = err; break; @@ -24529,7 +24907,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { if (lck->mti_readers[i].mr_pid.weak != pid) continue; - err = mdbx_rpid_check(env, pid); + err = osal_rpid_check(env, pid); if (MDBX_IS_ERROR(err)) { rc = err; break; @@ -24542,8 +24920,8 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { if (lck->mti_readers[j].mr_pid.weak == pid) { - mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, - (size_t)pid, lck->mti_readers[j].mr_txnid.weak); + DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, + lck->mti_readers[j].mr_txnid.weak); atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed); atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); count++; @@ -24552,25 +24930,25 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { } if (likely(!MDBX_IS_ERROR(rc))) - atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(), + atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(), mo_Relaxed); if (rdt_locked < 0) - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (pids != pidsbuf_onstask) - mdbx_free(pids); + osal_free(pids); if (dead) *dead = count; return rc; } -__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { - const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); +__cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { + const int rc = runtime_flags | (loglevel << 16); - if (loglevel != MDBX_LOG_DONTCHANGE) - mdbx_loglevel = (uint8_t)loglevel; + if (level != MDBX_LOG_DONTCHANGE) + loglevel = (uint8_t)level; if (flags != MDBX_DBG_DONTCHANGE) { flags &= @@ -24579,111 +24957,103 @@ __cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { #endif MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE; - mdbx_runtime_flags = (uint8_t)flags; + runtime_flags = (uint8_t)flags; } if (logger != MDBX_LOGGER_DONTCHANGE) - mdbx_debug_logger = logger; + debug_logger = logger; return rc; } -__cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard) { - mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard); +__cold static txnid_t kick_longlived_readers(MDBX_env *env, + const txnid_t laggard) { + DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); + osal_memory_fence(mo_AcquireRelease, false); + MDBX_hsr_func *const callback = env->me_hsr_callback; + txnid_t oldest = 0; + bool notify_eof_of_loop = false; + int retry = 0; + do { + const txnid_t steady = + 
env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady]; + env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; + oldest = find_oldest_reader(env, steady); + eASSERT(env, oldest < env->me_txn0->mt_txnid); + eASSERT(env, oldest >= laggard); + eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); - int retry; - for (retry = 0; retry < INT_MAX; ++retry) { - txnid_t oldest = mdbx_recent_steady_txnid(env); - mdbx_assert(env, oldest < env->me_txn0->mt_txnid); - mdbx_assert(env, oldest >= laggard); - mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (oldest == laggard || unlikely(!lck /* without-LCK mode */)) - return oldest; - - if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL))) + if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) break; - MDBX_reader *asleep = nullptr; - uint64_t oldest_retired = UINT64_MAX; - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ - const uint64_t snap_retired = atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); - const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely(snap_retired != - atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, - mo_AcquireRelease) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) - goto retry; - if (oldest > snap_txnid && - laggard <= /* ignore pending updates */ snap_txnid) { - oldest = snap_txnid; - oldest_retired = snap_retired; - asleep = &lck->mti_readers[i]; - } + if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL))) + break; + + if (!callback) + break; + + MDBX_reader *stucked = nullptr; + uint64_t hold_retired = 0; + for (unsigned i = 0; i < lck->mti_numreaders.weak; ++i) { + const uint64_t snap_retired = atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (rtxn == laggard && + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { + hold_retired = snap_retired; + stucked = &lck->mti_readers[i]; } } - if (laggard < oldest || !asleep) { - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - const txnid_t gap = oldest - laggard; - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, - (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, 0, - -retry); - } - mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, - lck->mti_oldest_reader.weak, oldest); - mdbx_assert(env, lck->mti_oldest_reader.weak <= oldest); - return atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); - } - - if (!env->me_hsr_callback) + if (!stucked) break; - uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease); - uint64_t tid = asleep->mr_tid.weak; - if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) + uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease); + uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease); + if (safe64_read(&stucked->mr_txnid) != laggard || !pid || + stucked->mr_snapshot_pages_retired.weak != hold_retired) continue; - const MDBX_meta *head_meta = constmeta_prefer_last(env); - const txnid_t gap = - (constmeta_txnid(env, head_meta) - laggard) / xMDBX_TXNID_STEP; + const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika); + const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; const uint64_t head_retired = - unaligned_peek_u64(4, head_meta->mm_pages_retired); + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); const size_t space = - (head_retired > oldest_retired) - ? pgno2bytes(env, (pgno_t)(head_retired - oldest_retired)) + (head_retired > hold_retired) + ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0; - int rc = env->me_hsr_callback( - env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); + int rc = + callback(env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); if (rc < 0) + /* hsr returned error and/or agree MDBX_MAP_FULL error */ break; if (rc > 0) { if (rc == 1) { - safe64_reset_compare(&asleep->mr_txnid, laggard); + /* hsr reported transaction (will be) aborted asynchronous */ + safe64_reset_compare(&stucked->mr_txnid, laggard); } else { - safe64_reset(&asleep->mr_txnid, true); - atomic_store64(&asleep->mr_tid, 0, mo_Relaxed); - atomic_store32(&asleep->mr_pid, 0, mo_Relaxed); + /* hsr reported reader process was killed and slot should be cleared */ + safe64_reset(&stucked->mr_txnid, true); + atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); + atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); } - atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed); - } - } + } else + notify_eof_of_loop = true; - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry); + } while (++retry < INT_MAX); + + if (notify_eof_of_loop) { + /* notify end of hsr-loop */ + const txnid_t turn = oldest - laggard; + if (turn) + NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, + laggard, oldest, turn); + callback(env, env->me_txn, 0, 0, laggard, + (turn < UINT_MAX) ? 
(unsigned)turn : UINT_MAX, 0, -retry); } - return mdbx_find_oldest(env->me_txn); + return oldest; } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -24730,18 +25100,18 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) return 0; } - txnid_t recent = 0; - volatile const MDBX_meta *meta = nullptr; + txnid_t lag; + meta_troika_t troika = meta_tap(env); do { - meta = meta_prefer_last(env); - recent = meta_txnid(env, meta); + const meta_ptr_t head = meta_recent(env, &troika); if (percent) { - const pgno_t maxpg = meta->mm_geo.now; - *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); + const pgno_t maxpg = head.ptr_v->mm_geo.now; + *percent = + (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } - } while (unlikely(recent != meta_txnid(env, meta))); + lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; + } while (unlikely(meta_should_retry(env, &troika))); - txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -24753,8 +25123,8 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, - const char *name, int deep); +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const char *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) @@ -24774,47 +25144,25 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { } /* Depth-first tree traversal. */ -__cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const char *name, int deep, - txnid_t parent_txnid) { +__cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, + const char *name, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; - int rc, err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, mp, 0); + int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); MDBX_page_type_t type = walk_page_type(mp); - const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1; - unsigned npages = (mp && IS_OVERFLOW(mp)) ? mp->mp_pages : 1; + const unsigned nentries = mp ? page_numkeys(mp) : 0; + unsigned npages = 1; size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); - size_t header_size = (mp && !IS_LEAF2(mp) && !IS_OVERFLOW(mp)) - ? PAGEHDRSZ + mp->mp_lower - : PAGEHDRSZ; + size_t header_size = + (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ; size_t payload_size = 0; size_t unused_size = - (mp && !IS_OVERFLOW(mp) ? page_room(mp) : pagesize - header_size) - - payload_size; + (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - switch (mp->mp_flags) { - default: - err = MDBX_CORRUPTED; - break; - case P_BRANCH: - if (unlikely(nentries < 2)) - err = MDBX_CORRUPTED; - case P_LEAF: - case P_LEAF | P_LEAF2: - break; - } - } - - for (int i = 0; err == MDBX_SUCCESS && i < nentries; - align_bytes += ((payload_size + align_bytes) & 1), i++) { + for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; + align_bytes += ((payload_size + align_bytes) & 1), ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ payload_size += mp->mp_leaf2_ksize; @@ -24842,26 +25190,19 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, const size_t over_header = PAGEHDRSZ; npages = 1; - MDBX_page *op; - err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, - pp_txnid4chk(mp, ctx->mw_txn)); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, op, 0); + assert(err == MDBX_SUCCESS); + pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); + err = lp.err; if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. */ - if (unlikely(P_OVERFLOW != op->mp_flags)) - err = bad_page(mp, "wrong page type %d for large data", op->mp_flags); - else - npages = op->mp_pages; + cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + npages = lp.page->mp_pages; } pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); const size_t over_unused = pagesize - over_payload - over_header; - rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, name, - pagesize, MDBX_page_large, err, 1, over_payload, - over_header, over_unused); + const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, + name, pagesize, MDBX_page_large, err, 1, + over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; } break; @@ -24869,24 +25210,29 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: { const size_t namelen = node_ks(node); payload_size += node_ds(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: payload_size += sizeof(MDBX_db); - if (unlikely(node_ds(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } break; case F_DUPDATA /* short sub-page */: { if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; } MDBX_page *sp = node_data(node); - const int nsubkeys = page_numkeys(sp); + const unsigned nsubkeys = page_numkeys(sp); size_t subheader_size = IS_LEAF2(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; size_t subunused_size = page_room(sp); @@ -24894,7 +25240,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, size_t subalign_bytes = 0; MDBX_page_type_t subtype; - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: subtype = MDBX_subpage_leaf; break; @@ -24902,12 +25248,13 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subtype = MDBX_subpage_dupfixed_leaf; break; default: + assert(err == MDBX_CORRUPTED); subtype = MDBX_subpage_broken; err = MDBX_CORRUPTED; } - for (int j = 0; err == MDBX_SUCCESS && j < nsubkeys; - subalign_bytes += ((subpayload_size + subalign_bytes) & 1), j++) { + for (unsigned j = 0; err == MDBX_SUCCESS && j < nsubkeys; + subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ @@ -24916,14 +25263,17 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(subtype == MDBX_subpage_leaf); MDBX_node *subnode = page_node(sp, j); subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); - if (unlikely(node_flags(subnode) != 0)) + if (unlikely(node_flags(subnode) != 0)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } } - rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), - subtype, err, nsubkeys, subpayload_size, - subheader_size, subunused_size + subalign_bytes); + const int rc = + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + subtype, err, nsubkeys, subpayload_size, + subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; header_size += subheader_size; @@ -24933,24 +25283,25 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; default: + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } } - rc = ctx->mw_visitor(pgno, 1, ctx->mw_user, deep, name, - ctx->mw_txn->mt_env->me_psize, type, err, nentries, - payload_size, header_size, unused_size + align_bytes); + const int rc = ctx->mw_visitor( + pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, + err, nentries, payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; - for (int i = 0; err == MDBX_SUCCESS && i < nentries; i++) { + for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) continue; MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { - err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); + assert(err == MDBX_SUCCESS); + err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -24968,6 +25319,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: { const size_t namelen = node_ks(node); if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; } @@ -24975,35 +25327,38 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, char namebuf_onstask[64]; char *const sub_name = (namelen < sizeof(namebuf_onstask)) ? 
namebuf_onstask - : mdbx_malloc(namelen + 1); - if (sub_name) { - memcpy(sub_name, node_key(node), namelen); - sub_name[namelen] = 0; - memcpy(&db, node_data(node), sizeof(db)); - err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); - if (sub_name != namebuf_onstask) - mdbx_free(sub_name); - } else { - err = MDBX_ENOMEM; - } + : osal_malloc(namelen + 1); + if (unlikely(!sub_name)) + return MDBX_ENOMEM; + memcpy(sub_name, node_key(node), namelen); + sub_name[namelen] = 0; + memcpy(&db, node_data(node), sizeof(db)); + assert(err == MDBX_SUCCESS); + err = walk_sdb(ctx, &db, sub_name, deep + 1); + if (sub_name != namebuf_onstask) + osal_free(sub_name); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_ds(node) != sizeof(MDBX_db) || - ctx->mw_cursor->mc_xcursor == NULL)) + ctx->mw_cursor->mc_xcursor == NULL)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - else { + } else { memcpy(&db, node_data(node), sizeof(db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); - ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); - MDBX_xcursor *inner_xcursor = - container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); - MDBX_cursor_couple *couple = - container_of(inner_xcursor, MDBX_cursor_couple, inner); - ctx->mw_cursor = &couple->outer; + assert(err == MDBX_SUCCESS); + err = cursor_xinit1(ctx->mw_cursor, node, mp); + if (likely(err == MDBX_SUCCESS)) { + ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; + err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + MDBX_xcursor *inner_xcursor = + container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); + MDBX_cursor_couple *couple = + container_of(inner_xcursor, MDBX_cursor_couple, inner); + ctx->mw_cursor = &couple->outer; + } } break; } @@ -25012,25 +25367,28 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, - const char *name, int deep) { - if (unlikely(db->md_root == P_INVALID)) +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const char *name, int deep) { + if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbistate); + int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (ctx->mw_dont_check_keys_ordering) { - couple.outer.mc_flags |= C_SKIPORD; - couple.inner.mx_cursor.mc_flags |= C_SKIPORD; - } + couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = mdbx_walk_tree(ctx, db->md_root, name, deep, ctx->mw_txn->mt_txnid); + rc = walk_tree(ctx, sdb->md_root, name, deep, + sdb->md_mod_txnid ? 
sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -25054,9 +25412,9 @@ __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * NUM_METAS); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); return rc; } @@ -25184,7 +25542,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { - mdbx_error("Mismatch cursors's pages at %u level", r->level); + ERROR("Mismatch cursors's pages at %u level", r->level); return MDBX_PROBLEM; } @@ -25349,7 +25707,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, cursor_copy(cursor, &next.outer); if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; - rc = mdbx_xcursor_init0(&next.outer); + rc = cursor_xinit0(&next.outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; @@ -25407,7 +25765,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ - rc = mdbx_cursor_init(&begin.outer, txn, dbi); + rc = cursor_init(&begin.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -25423,7 +25781,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, return MDBX_SUCCESS; } MDBX_val stub = {0, 0}; - rc = mdbx_cursor_first(&begin.outer, &stub, &stub); + rc = cursor_first(&begin.outer, &stub, &stub); if (unlikely(end_key == MDBX_EPSILON)) { /* LY: FIRST..+epsilon case */ return (rc == MDBX_SUCCESS) @@ -25435,7 +25793,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (end_key == NULL) { /* LY: -epsilon..LAST case */ MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&begin.outer, &stub, &stub); + rc = cursor_last(&begin.outer, &stub, &stub); return (rc == MDBX_SUCCESS) ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) : rc; @@ -25452,7 +25810,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, (begin_key == end_key || begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) { /* LY: single key case */ - rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; + rc = cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { *size_items = 0; return (rc == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : rc; @@ -25461,10 +25819,9 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (begin.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top], begin.outer.mc_ki[begin.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { /* LY: return the number of duplicates for given key */ - mdbx_tassert(txn, - begin.outer.mc_xcursor == &begin.inner && + tASSERT(txn, begin.outer.mc_xcursor == &begin.inner && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED)); *size_items = (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) || @@ -25475,8 +25832,8 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } return MDBX_SUCCESS; } else { - rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data, - begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&begin.outer, begin_key, begin_data, + begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } } @@ -25487,15 +25844,15 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } MDBX_cursor_couple end; - rc = mdbx_cursor_init(&end.outer, txn, dbi); + rc = cursor_init(&end.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (!end_key) { MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&end.outer, &stub, &stub); + rc = cursor_last(&end.outer, &stub, &stub); } else { - rc = mdbx_cursor_set(&end.outer, end_key, end_data, - end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&end.outer, end_key, end_data, + end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } if (unlikely(rc != MDBX_SUCCESS)) { @@ -25588,7 +25945,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, return MDBX_EINVAL; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -25629,9 +25986,9 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (flags & MDBX_CURRENT) { /* disallow update/delete for multi-values */ MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + if (node_flags(node) & F_DUPDATA) { + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { rc = MDBX_EMULTIVAL; goto bailout; @@ -25763,7 +26120,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, return MDBX_BAD_DBI; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -25780,7 +26137,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(new < increment)) return MDBX_RESULT_TRUE; - mdbx_tassert(txn, new > dbs->md_seq); + tASSERT(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbistate[dbi] |= DBI_DIRTY; @@ -25858,7 +26215,7 @@ static __always_inline uint64_t double2key(const double *const ptr) { const int64_t i = *(const int64_t *)ptr; const uint64_t u = (i < 0) ? 
UINT64_C(0xffffFFFFffffFFFF) - i : i + UINT64_C(0x8000000000000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const double f = key2double(u); assert(memcmp(&f, ptr, 8) == 0); } @@ -25881,7 +26238,7 @@ static __always_inline uint32_t float2key(const float *const ptr) { const int32_t i = *(const int32_t *)ptr; const uint32_t u = (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const float f = key2float(u); assert(memcmp(&f, ptr, 4) == 0); } @@ -26088,7 +26445,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return err; const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - env->me_txn0->mt_owner != mdbx_thread_self()); + env->me_txn0->mt_owner != osal_thread_self()); bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: @@ -26121,7 +26478,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(value > UINT32_MAX)) return MDBX_TOO_LARGE; if (atomic_store64(&env->me_lck->mti_autosync_period, - mdbx_osal_16dot16_to_monotime((uint32_t)value), + osal_16dot16_to_monotime((uint32_t)value), mo_Relaxed) != 0 && (env->me_flags & MDBX_ENV_ACTIVE)) { err = mdbx_env_sync_poll(env); @@ -26165,13 +26522,13 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { - mdbx_assert(env, env->me_dp_reserve != NULL); + eASSERT(env, env->me_dp_reserve != NULL); MDBX_page *dp = env->me_dp_reserve; MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dp_reserve = dp->mp_next; VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + osal_free(dp); env->me_dp_reserve_len -= 1; } } @@ -26284,7 +26641,7 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, case MDBX_opt_sync_period: if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; - *pvalue = mdbx_osal_monotime_to_16dot16( + *pvalue = osal_monotime_to_16dot16( atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed)); break; @@ -26336,131 +26693,93 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, return MDBX_SUCCESS; } -/*** Attribute support functions for Nexenta **********************************/ -#ifdef MDBX_NEXENTA_ATTRS - -static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { - if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) - return MDBX_INCOMPATIBLE; - - if (likely(attrptr != NULL)) - *attrptr = *(mdbx_attr_t *)data->iov_base; - data->iov_len -= sizeof(mdbx_attr_t); - data->iov_base = - likely(data->iov_len > 0) ? ((mdbx_attr_t *)data->iov_base) + 1 : NULL; - - return MDBX_SUCCESS; -} - -static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - mdbx_attr_t *space = reserved->iov_base; - if (flags & MDBX_RESERVE) { - if (likely(data != NULL)) { - data->iov_base = data->iov_len ? 
space + 1 : NULL; - } - } else { - *space = attr; - if (likely(data != NULL)) { - memcpy(space + 1, data->iov_base, data->iov_len); - } +__cold void global_ctor(void) { + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_table = rthc_table_static; +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(&rthc_critical_section); + InitializeCriticalSection(&lcklist_critical_section); +#else + ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); + TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); +#endif + /* checking time conversion, this also avoids racing on 32-bit architectures + * during storing calculated 64-bit ratio(s) into memory. */ + uint32_t proba = UINT32_MAX; + while (true) { + unsigned time_conversion_checkup = + osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); + unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; + unsigned one_less = (proba > 0) ? proba - 1 : proba; + ENSURE(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); + if (proba == 0) + break; + proba >>= 1; } - return MDBX_SUCCESS; -} + bootid = osal_bootid(); -int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op) { - int rc = mdbx_cursor_get(mc, key, data, op); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +#if MDBX_DEBUG + for (unsigned i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { + const bool s0 = (i >> 0) & 1; + const bool s1 = (i >> 1) & 1; + const bool s2 = (i >> 2) & 1; + const uint8_t c01 = (i / (8 * 1)) % 3; + const uint8_t c02 = (i / (8 * 3)) % 3; + const uint8_t c12 = (i / (8 * 9)) % 3; - return mdbx_attr_peek(data, attrptr); -} + const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); + meta_troika_t troika; + troika.fsm = (uint8_t)i; + meta_troika_unpack(&troika, packed); -int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - uint64_t *attrptr) { - int rc = mdbx_get(txn, dbi, key, data); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const uint8_t tail = TROIKA_TAIL(&troika); + const bool strict = TROIKA_STRICT_VALID(&troika); + const bool valid = TROIKA_VALID(&troika); - return mdbx_attr_peek(data, attrptr); -} + const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady_chk = + meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); -int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); + uint8_t tail_chk; + if (recent_chk == 0) + tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent_chk == 1) + tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0; - int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? 
data->iov_len : 0) + sizeof(mdbx_attr_t); - - int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr) { - if (unlikely(!key || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(!check_dbi(txn, dbi, DB_USRVALID))) - return MDBX_BAD_DBI; - - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - - MDBX_cursor_couple cx; - MDBX_val old_data; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && data) { - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); - txn->mt_cursors[dbi] = cx.outer.mc_next; - } - return rc; + const bool valid_chk = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + assert(troika.recent == recent_chk); + assert(troika.prefer_steady == prefer_steady_chk); + assert(tail == tail_chk); + assert(valid == valid_chk); + assert(strict == strict_chk); + // printf(" %d, ", packed); + assert(troika_fsm_map[troika.fsm] == packed); } +#endif /* MDBX_DEBUG*/ - mdbx_attr_t old_attr = 0; - rc = mdbx_attr_peek(&old_data, &old_attr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len && - memcmp(data->iov_base, old_data.iov_base, - old_data.iov_len) == 0))) - return MDBX_SUCCESS; - - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data ? 
data : &old_data, attr, - MDBX_CURRENT); - txn->mt_cursors[dbi] = cx.outer.mc_next; - return rc; +#if 0 /* debug */ + for (unsigned i = 0; i < 65536; ++i) { + size_t pages = pv2pages(i); + unsigned x = pages2pv(pages); + size_t xp = pv2pages(x); + if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) + printf("%u => %zu => %u => %zu\n", i, pages, x, xp); + assert(pages == xp); + } + fflush(stdout); +#endif /* #if 0 */ } -#endif /* MDBX_NEXENTA_ATTRS */ /******************************************************************************/ @@ -26586,6 +26905,9 @@ __dll_export #endif /* MDBX_BUILD_TYPE */ , "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) +#ifdef ENABLE_GPROF + " ENABLE_GPROF" +#endif /* ENABLE_GPROF */ " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) " BYTE_ORDER=" #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -26595,6 +26917,7 @@ __dll_export #else #error "FIXME: Unsupported byte order" #endif /* __BYTE_ORDER__ */ + " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG @@ -26602,9 +26925,9 @@ __dll_export " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) -#if MDBX_DISABLE_PAGECHECKS - " MDBX_DISABLE_PAGECHECKS=YES" -#endif /* MDBX_DISABLE_PAGECHECKS */ +#if MDBX_DISABLE_VALIDATION + " MDBX_DISABLE_VALIDATION=YES" +#endif /* MDBX_DISABLE_VALIDATION */ #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ @@ -26947,12 +27270,12 @@ __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, (void)env; #endif /* MDBX_DEBUG */ - if (mdbx_debug_logger) - mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); + if (debug_logger) + debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) char *message = nullptr; - const int num = mdbx_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", + const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", msg, func ? func : "unknown", line); if (num < 1 || !message) message = ""; @@ -26976,7 +27299,7 @@ __cold void mdbx_panic(const char *fmt, ...) { va_start(ap, fmt); char *message = nullptr; - const int num = mdbx_vasprintf(&message, fmt, ap); + const int num = osal_vasprintf(&message, fmt, ap); va_end(ap); const char *const const_message = (num < 1 || !message) ? "" @@ -26996,8 +27319,8 @@ __cold void mdbx_panic(const char *fmt, ...) { /*----------------------------------------------------------------------------*/ -#ifndef mdbx_vasprintf -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, +#ifndef osal_vasprintf +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap) { va_list ones; va_copy(ones, ap); @@ -27009,7 +27332,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, return needed; } - *strp = mdbx_malloc(needed + 1); + *strp = osal_malloc(needed + 1); if (unlikely(*strp == nullptr)) { va_end(ones); #if defined(_WIN32) || defined(_WIN64) @@ -27025,25 +27348,25 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, assert(actual == needed); if (unlikely(actual < 0)) { - mdbx_free(*strp); + osal_free(*strp); *strp = nullptr; } return actual; } -#endif /* mdbx_vasprintf */ +#endif /* osal_vasprintf */ -#ifndef mdbx_asprintf -MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) 
{ +#ifndef osal_asprintf +MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - int rc = mdbx_vasprintf(strp, fmt, ap); + int rc = osal_vasprintf(strp, fmt, ap); va_end(ap); return rc; } -#endif /* mdbx_asprintf */ +#endif /* osal_asprintf */ -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result) { assert(is_powerof2(alignment) && alignment >= sizeof(void *)); #if defined(_WIN32) || defined(_WIN64) @@ -27064,35 +27387,35 @@ MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, #error FIXME #endif } -#endif /* mdbx_memalign_alloc */ +#endif /* osal_memalign_alloc */ -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) { +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) { #if defined(_WIN32) || defined(_WIN64) VirtualFree(ptr, 0, MEM_RELEASE); #else - mdbx_free(ptr); + osal_free(ptr); #endif } -#endif /* mdbx_memalign_free */ +#endif /* osal_memalign_free */ -#ifndef mdbx_strdup -char *mdbx_strdup(const char *str) { +#ifndef osal_strdup +char *osal_strdup(const char *str) { if (!str) return NULL; size_t bytes = strlen(str) + 1; - char *dup = mdbx_malloc(bytes); + char *dup = osal_malloc(bytes); if (dup) memcpy(dup, str, bytes); return dup; } -#endif /* mdbx_strdup */ +#endif /* osal_strdup */ /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) { int rc; - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); #if defined(_WIN32) || defined(_WIN64) if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { rc = (int)GetLastError(); @@ -27125,11 +27448,11 @@ bailout_cond: (void)pthread_mutex_destroy(&condpair->mutex); #endif bailout_mutex: - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); @@ -27139,20 +27462,20 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; #endif - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); return waitstatus2errcode(code); #else - return mdbx_pthread_mutex_lock(&condpair->mutex); + return osal_pthread_mutex_lock(&condpair->mutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) return ReleaseMutex(condpair->mutex) ? 
MDBX_SUCCESS : (int)GetLastError(); #else @@ -27160,7 +27483,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError(); @@ -27169,7 +27492,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], @@ -27187,7 +27510,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -27196,7 +27519,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) DeleteCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -27205,7 +27528,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) __try { EnterCriticalSection(fastmutex); @@ -27218,11 +27541,11 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { } return MDBX_SUCCESS; #else - return mdbx_pthread_mutex_lock(fastmutex); + return osal_pthread_mutex_lock(fastmutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -27233,15 +27556,28 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - return DeleteFileW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); + +#ifndef WC_ERR_INVALID_CHARS +static const DWORD WC_ERR_INVALID_CHARS = + (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) + ? 
0x00000080 + : 0; +#endif /* WC_ERR_INVALID_CHARS */ + +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n) { + return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, + (int)src_n, dst, (int)dst_n); +} + +#endif /* Windows */ + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return unlink(pathname) ? errno : MDBX_SUCCESS; #endif @@ -27251,34 +27587,22 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } #endif /*! Windows */ -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); + return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return rmdir(pathname) ? errno : MDBX_SUCCESS; #endif } -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits) { *fd = INVALID_HANDLE_VALUE; #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; DWORD FlagsAndAttributes = FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; @@ -27323,12 +27647,12 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, break; } - *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, + *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) { int err = (int)GetLastError(); if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { - if (GetFileAttributesW(pathnameW) == INVALID_FILE_ATTRIBUTES && + if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && GetLastError() == ERROR_FILE_NOT_FOUND) err = ERROR_FILE_NOT_FOUND; } @@ -27347,7 +27671,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); if (AttributesDiff) - (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff); + (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); #else int flags = unix_mode_bits ? 
O_CREAT : 0; @@ -27401,18 +27725,18 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; if (!is_valid_fd(STDIN_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", - STDIN_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); } if (!is_valid_fd(STDOUT_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "OUT", STDOUT_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", + STDOUT_FILENO, dev_null); stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); } if (!is_valid_fd(STDERR_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "ERR", STDERR_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", + STDERR_FILENO, dev_null); stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); } #else @@ -27437,20 +27761,20 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", - STDIN_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); assert(stub_fd0 == -1); *fd = dup(stub_fd0 = *fd); } if (*fd == STDOUT_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", - STDOUT_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); assert(stub_fd1 == -1); *fd = dup(stub_fd1 = *fd); } if (*fd == STDERR_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", - STDERR_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", + STDERR_FILENO); assert(stub_fd2 == -1); *fd = dup(stub_fd2 = *fd); } @@ -27461,10 +27785,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, if (stub_fd2 != -1) close(stub_fd2); if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { - mdbx_error( - "Rejecting the use of a FD in the range " - "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", - STDIN_FILENO, STDERR_FILENO); + ERROR("Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); close(*fd); return EBADF; } @@ -27491,7 +27814,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -27500,7 +27823,7 @@ MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; @@ -27527,7 +27850,7 @@ MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, return (bytes == (size_t)read) ? 
MDBX_SUCCESS : MDBX_ENODATA; } -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, uint64_t offset) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -27563,7 +27886,7 @@ MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, } } -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -27593,13 +27916,13 @@ MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, } } -int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written) { #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ (defined(__ANDROID_API__) && __ANDROID_API__ < 24) size_t written = 0; for (int i = 0; i < iovcnt; ++i) { - int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; @@ -27622,8 +27945,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - enum mdbx_syncmode_bits mode_bits) { +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + enum osal_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) return (int)GetLastError(); @@ -27652,7 +27975,7 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, break /* error */; #if defined(__linux__) || defined(__gnu_linux__) case MDBX_SYNC_SIZE: - if (mdbx_linux_kernel_version >= 0x03060000) + if (linux_kernel_version >= 0x03060000) return MDBX_SUCCESS; __fallthrough /* fall through */; #endif /* Linux */ @@ -27669,7 +27992,7 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, #endif } -int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { +int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) @@ -27688,7 +28011,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) switch (GetFileType(fd)) { case FILE_TYPE_DISK: @@ -27719,7 +28042,7 @@ MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_SetFileInformationByHandle) { FILE_END_OF_FILE_INFO EndOfFileInfo; @@ -27743,7 +28066,7 @@ MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; li.QuadPart = pos; @@ -27759,7 +28082,7 @@ MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { /*----------------------------------------------------------------------------*/ 
MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg) { #if defined(_WIN32) || defined(_WIN64) @@ -27770,7 +28093,7 @@ mdbx_thread_create(mdbx_thread_t *thread, #endif } -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(thread, INFINITE); return waitstatus2errcode(code); @@ -27782,16 +28105,16 @@ MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits) { + enum osal_syncmode_bits mode_bits) { uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300) + if (mode_bits == MDBX_SYNC_NONE && linux_kernel_version > 0x02061300) /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly * tracks dirty pages and flushes them to storage as necessary. */ return MDBX_SUCCESS; @@ -27800,11 +28123,12 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, return errno; mode_bits &= ~MDBX_SYNC_DATA; #endif - return mdbx_fsync(map->fd, mode_bits); + return osal_fsync(map->fd, mode_bits); } -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err) { +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err) { #if defined(_WIN32) || defined(_WIN64) (void)pathname; (void)err; @@ -27832,7 +28156,7 @@ MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, return MDBX_SUCCESS; } -static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { +static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) return ERROR_NOT_CAPABLE /* workaround for Wine */; @@ -27879,7 +28203,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) { - WCHAR *PathBuffer = mdbx_malloc(sizeof(WCHAR) * INT16_MAX); + WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX); if (!PathBuffer) return MDBX_ENOMEM; @@ -27947,7 +28271,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } bailout: - mdbx_free(PathBuffer); + osal_free(PathBuffer); return rc; } @@ -28124,11 +28448,10 @@ static int check_mmap_limit(const size_t limit) { const int log2page = log2n_powerof2(pagesize); if ((limit >> (log2page + 7)) > (size_t)total_ram_pages || (limit >> (log2page + 6)) > (size_t)avail_ram_pages) { - mdbx_error( - "%s (%zu pages) is too large for available (%zu pages) or total " - "(%zu pages) system RAM", - "database upper size limit", limit >> log2page, avail_ram_pages, - total_ram_pages); + ERROR("%s (%zu pages) is too large for available (%zu pages) or total " + "(%zu pages) system RAM", + "database upper size limit", limit >> log2page, avail_ram_pages, + total_ram_pages); return MDBX_TOO_LARGE; } } @@ -28136,7 +28459,7 @@ 
static int check_mmap_limit(const size_t limit) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t size, const size_t limit, const unsigned options) { assert(size <= limit); @@ -28148,7 +28471,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, map->section = NULL; #endif /* Windows */ - int err = mdbx_check_fs_local(map->fd, flags); + int err = osal_check_fs_local(map->fd, flags); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -28157,7 +28480,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, return err; if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); if (err != MDBX_SUCCESS) return err; map->filesize = size; @@ -28165,7 +28488,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, map->current = size; #endif /* !Windows */ } else { - err = mdbx_filesize(map->fd, &map->filesize); + err = osal_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) return err; #if !(defined(_WIN32) || defined(_WIN64)) @@ -28272,7 +28595,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. @@ -28299,7 +28622,7 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit) { assert(size <= limit); #if defined(_WIN32) || defined(_WIN64) @@ -28396,12 +28719,12 @@ retry_file_and_section: map->address = NULL; } - err = mdbx_filesize(map->fd, &map->filesize); + err = osal_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) goto bailout; if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); if (err == MDBX_SUCCESS) map->filesize = size; /* ignore error, because Windows unable shrink file @@ -28477,7 +28800,7 @@ retry_mapview:; #else /* Windows */ map->filesize = 0; - int rc = mdbx_filesize(map->fd, &map->filesize); + int rc = osal_filesize(map->fd, &map->filesize); if (rc != MDBX_SUCCESS) return rc; @@ -28487,7 +28810,7 @@ retry_mapview:; rc = (size > map->current) ? 
MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; } else { if (map->filesize != size) { - rc = mdbx_ftruncate(map->fd, size); + rc = osal_ftruncate(map->fd, size); if (rc != MDBX_SUCCESS) return rc; map->filesize = size; @@ -28669,7 +28992,7 @@ retry_mapview:; /*----------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny) { +__cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { for (;;) { #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__) @@ -28724,8 +29047,7 @@ static LARGE_INTEGER performance_frequency; static uint64_t ratio_16dot16_to_monotine; #endif -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #if defined(_WIN32) || defined(_WIN64) if (unlikely(performance_frequency.QuadPart == 0)) QueryPerformanceFrequency(&performance_frequency); @@ -28744,13 +29066,13 @@ mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1; } -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { static uint64_t limit; if (unlikely(monotime > limit)) { - if (limit != 0) + if (likely(limit != 0)) return UINT32_MAX; - limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1); - if (monotime > limit) + limit = osal_16dot16_to_monotime(UINT32_MAX - 1); + if (unlikely(monotime > limit)) return UINT32_MAX; } const uint32_t ret = @@ -28761,10 +29083,12 @@ MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { #else (uint32_t)(monotime * 128 / 1953125); #endif - return likely(ret || monotime == 0) ? ret : /* fix underflow */ 1; + if (likely(ret > 0)) + return ret; + return monotime > 0 /* fix underflow */; } -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER counter; counter.QuadPart = 0; @@ -28804,7 +29128,7 @@ static void bootid_shake(bin128_t *p) { p->d = e + p->a; } -static void bootid_collect(bin128_t *p, const void *s, size_t n) { +__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { p->y += UINT64_C(64526882297375213); bootid_shake(p); for (size_t i = 0; i < n; ++i) { @@ -28933,7 +29257,7 @@ bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { return false; } -__cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) { +__cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) { bin128_t bin = {{0, 0}}; bool got_machineid = false, got_boottime = false, got_bootseq = false; @@ -29246,7 +29570,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, if (avail_pages) *avail_pages = -1; - const intptr_t pagesize = mdbx_syspagesize(); + const intptr_t pagesize = osal_syspagesize(); if (page_size) *page_size = pagesize; if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) @@ -29362,7 +29686,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, #if MDBX_VERSION_MAJOR != 0 || \ - MDBX_VERSION_MINOR != 11 + MDBX_VERSION_MINOR != 12 #error "API version mismatch! Had `git fetch --tags` done?" 
#endif @@ -29382,11 +29706,11 @@ __dll_export #endif const struct MDBX_version_info mdbx_version = { 0, - 11, - 8, + 12, + 1, 0, - {"2022-06-12T23:47:18+03:00", "42c5683febaffacc19a4e3e6dbebfffbd9ea92da", "bd80e01eda6f0220dd06a80da838ebbe3efca95c", - "v0.11.8-0-gbd80e01e"}, + {"2022-08-24T16:24:22+03:00", "0803c79d2d94f2d1496166a9a86bd47da18c7eed", "b36a07a512c1412d5753219aa8fc66cab75a012a", + "v0.12.1-0-gb36a07a5"}, sourcery}; __dll_export @@ -29451,16 +29775,16 @@ static switch (reason) { case DLL_PROCESS_ATTACH: mdbx_winnt_import(); - mdbx_rthc_global_init(); + global_ctor(); break; case DLL_PROCESS_DETACH: - mdbx_rthc_global_dtor(); + global_dtor(); break; case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: - mdbx_rthc_thread_dtor(module); + thread_dtor(module); break; } #if MDBX_BUILD_SHARED_LIBRARY @@ -29585,8 +29909,8 @@ void mdbx_txn_unlock(MDBX_env *env) { #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_srwlock_AcquireShared(&env->me_remap_guard); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + osal_srwlock_AcquireShared(&env->me_remap_guard); if (env->me_lfd == INVALID_HANDLE_VALUE) return MDBX_SUCCESS; /* readonly database in readonly filesystem */ @@ -29597,21 +29921,21 @@ MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { return MDBX_SUCCESS; int rc = (int)GetLastError(); - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && !funlock(env->me_lfd, LCK_UPPER)) mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); } - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { return flock(fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, @@ -29624,7 +29948,7 @@ static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; if ((*array)->count == limit) { - void *ptr = mdbx_realloc( + void *ptr = osal_realloc( (limit > ARRAY_LENGTH((*array)->handles)) ? *array : /* don't free initial array on the stack */ NULL, @@ -29658,8 +29982,8 @@ static int suspend_and_append(mdbx_handle_array_t **array, } MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); const uintptr_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck_mmap.lck) { @@ -29681,7 +30005,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); if (rc != MDBX_SUCCESS) { bailout_lck: - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } } @@ -29693,7 +30017,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } else { /* Without LCK (i.e. read-only mode). 
* Walk through a snapshot of all running threads */ - mdbx_assert(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); + eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); if (hSnapshot == INVALID_HANDLE_VALUE) return (int)GetLastError(); @@ -29705,7 +30029,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = (int)GetLastError(); bailout_toolhelp: CloseHandle(hSnapshot); - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } @@ -29730,7 +30054,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { +osal_resume_threads_after_remap(mdbx_handle_array_t *array) { int rc = MDBX_SUCCESS; for (unsigned i = 0; i < array->count; ++i) { const HANDLE hThread = array->handles[i]; @@ -29769,11 +30093,11 @@ mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { * E-S * E-E = exclusive-write, i.e. exclusive due (re)initialization * - * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked + * The osal_lck_seize() moves the locking-FSM from the initial free/unlocked * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, * or to the "used" (and returns MDBX_RESULT_FALSE). * - * The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write" + * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" * state to the "used" (i.e. shared) state. * * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) @@ -29831,21 +30155,21 @@ static int internal_seize_lck(HANDLE lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? (free), get ?-E (middle) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc); + ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ rc = (int)GetLastError(); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) @@ -29855,13 +30179,13 @@ static int internal_seize_lck(HANDLE lfd) { } /* 7) still on ?-E (middle), try S-E (locked) */ - mdbx_jitter4testing(false); + jitter4testing(false); rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE : (int)GetLastError(); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc != MDBX_RESULT_FALSE) - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transition to S-? (used) or ?-? 
(free) */ @@ -29873,7 +30197,7 @@ static int internal_seize_lck(HANDLE lfd) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { int rc; assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); @@ -29884,17 +30208,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = (int)GetLastError(); - mdbx_error("%s, err %u", "without-lck", rc); + ERROR("%s, err %u", "without-lck", rc); return rc; } return MDBX_RESULT_FALSE; } rc = internal_seize_lck(env->me_lfd); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. Should be @@ -29904,11 +30228,11 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { * while opening db in valid (non-conflict) mode. */ if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { rc = (int)GetLastError(); - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_jitter4testing(false); + ERROR("%s, err %u", "lock-against-without-lck", rc); + jitter4testing(false); lck_unlock(env); } else { - mdbx_jitter4testing(false); + jitter4testing(false); if (!funlock(env->me_lazy_fd, LCK_BODY)) mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", (int)GetLastError()); @@ -29918,7 +30242,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { /* Transite from exclusive-write state (E-E) to used (S-?) */ assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); @@ -29934,7 +30258,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* 2) now at ?-E (middle), transition to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { int rc = (int)GetLastError() /* 3) something went wrong, give up */; - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } @@ -29956,10 +30280,10 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { int rc; /* 1) now on S-? 
(used), try S-E (locked) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_verbose("%s, err %u", "S-?(used) >> S-E(locked)", rc); + VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; } @@ -29969,17 +30293,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { (int)GetLastError()); /* 4) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { rc = (int)GetLastError() /* 5) something went wrong, give up */; - mdbx_verbose("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); + VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; } return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; } -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { (void)env; @@ -29988,19 +30312,19 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { /* LY: should unmap before releasing the locks to avoid race condition and * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ if (env->me_map) - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); if (env->me_lck_mmap.lck) { const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && mdbx_lck_upgrade(env) == MDBX_SUCCESS) /* this will fail if LCK is used/mmapped by other process(es) */ - mdbx_ftruncate(env->me_lfd, 0); + osal_ftruncate(env->me_lfd, 0); } lck_unlock(env); return MDBX_SUCCESS; @@ -30009,12 +30333,12 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*----------------------------------------------------------------------------*/ /* reader checking (by pid) */ -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } @@ -30025,7 +30349,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { (void)env; HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); int rc; @@ -30062,11 +30386,11 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { // Stub for slim read-write lock // Copyright (C) 1995-2002 Brad Wilson -static void WINAPI stub_srwlock_Init(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) { srwl->readerCount = srwl->writerCount = 0; } -static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); @@ -30091,12 +30415,12 @@ static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { } } -static void WINAPI stub_srwlock_ReleaseShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) { assert(srwl->readerCount > 0); _InterlockedDecrement(&srwl->readerCount); } -static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); @@ -30122,7 +30446,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { } } -static void WINAPI stub_srwlock_ReleaseExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) { assert(srwl->writerCount == 1 && srwl->readerCount >= 0); srwl->writerCount = 0; } @@ -30138,9 +30462,9 @@ static uint64_t WINAPI stub_GetTickCount64(void) { /*----------------------------------------------------------------------------*/ #ifndef xMDBX_ALLOY -MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, - mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive, - mdbx_srwlock_ReleaseExclusive; +osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, + osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, + osal_srwlock_ReleaseExclusive; MDBX_NtExtendSection mdbx_NtExtendSection; MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; @@ -30188,24 +30512,24 @@ static void mdbx_winnt_import(void) { GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); #undef GET_PROC_ADDR - const MDBX_srwlock_function init = - (MDBX_srwlock_function)GetProcAddress(hKernel32dll, "InitializeSRWLock"); + const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "InitializeSRWLock"); if (init != NULL) { - mdbx_srwlock_Init = init; - mdbx_srwlock_AcquireShared = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_Init = init; + osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "AcquireSRWLockShared"); - mdbx_srwlock_ReleaseShared = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "ReleaseSRWLockShared"); - mdbx_srwlock_AcquireExclusive = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "AcquireSRWLockExclusive"); - mdbx_srwlock_ReleaseExclusive = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "ReleaseSRWLockExclusive"); } else { - mdbx_srwlock_Init = stub_srwlock_Init; - mdbx_srwlock_AcquireShared = stub_srwlock_AcquireShared; - 
mdbx_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; - mdbx_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; - mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; + osal_srwlock_Init = stub_srwlock_Init; + osal_srwlock_AcquireShared = stub_srwlock_AcquireShared; + osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; + osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; + osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } } @@ -30243,10 +30567,11 @@ static void mdbx_winnt_import(void) { #include #ifndef xMDBX_ALLOY -uint32_t mdbx_linux_kernel_version; +uint32_t linux_kernel_version; bool mdbx_RunningOnWSL1; #endif /* xMDBX_ALLOY */ +MDBX_EXCLUDE_FOR_GPROF __cold static uint8_t probe_for_WSL(const char *tag) { const char *const WSL = strstr(tag, "WSL"); if (WSL && WSL[3] >= '2' && WSL[3] <= '9') @@ -30257,14 +30582,28 @@ __cold static uint8_t probe_for_WSL(const char *tag) { if (WSL || wsl || strcasestr(tag, "Microsoft")) /* Expecting no new kernel within WSL1, either it will explicitly * marked by an appropriate WSL-version hint. */ - return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; + return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; return 0; } #endif /* Linux */ +#ifdef ENABLE_GPROF +extern void _mcleanup(void); +extern void monstartup(unsigned long, unsigned long); +extern void _init(void); +extern void _fini(void); +extern void __gmon_start__(void) __attribute__((__weak__)); +#endif /* ENABLE_GPROF */ + +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__constructor__)) void mdbx_global_constructor(void) { +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + monstartup((uintptr_t)&_init, (uintptr_t)&_fini); +#endif /* ENABLE_GPROF */ + #if defined(__linux__) || defined(__gnu_linux__) struct utsname buffer; if (uname(&buffer) == 0) { @@ -30276,7 +30615,7 @@ mdbx_global_constructor(void) { if (number > 0) { if (number > 255) number = 255; - mdbx_linux_kernel_version += number << (24 - i * 8); + linux_kernel_version += number << (24 - i * 8); } ++i; } else { @@ -30296,12 +30635,17 @@ mdbx_global_constructor(void) { } #endif /* Linux */ - mdbx_rthc_global_init(); + global_ctor(); } +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__destructor__)) void mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); + global_dtor(); +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + _mcleanup(); +#endif /* ENABLE_GPROF */ } /*----------------------------------------------------------------------------*/ @@ -30313,15 +30657,15 @@ mdbx_global_destructor(void) { * размещаются совместно используемые posix-мьютексы (futex). Посредством * этих мьютексов (см struct MDBX_lockinfo) реализуются: * - Блокировка таблицы читателей для регистрации, - * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). * - Блокировка БД для пишущих транзакций, * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). * * Остальной функционал реализуется отдельно посредством файловых блокировок: * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * в операционный режим, функции osal_lck_seize() и osal_lck_downgrade(). * - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). + * т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check(). 
* * Для блокировки файлов используется fcntl(F_SETLK), так как: * - lockf() оперирует только эксклюзивной блокировкой и требует @@ -30365,9 +30709,9 @@ mdbx_global_destructor(void) { static int op_setlk, op_setlkw, op_getlk; __cold static void choice_fcntl(void) { assert(!op_setlk && !op_setlkw && !op_getlk); - if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 + if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 #if defined(__linux__) || defined(__gnu_linux__) - && mdbx_linux_kernel_version > + && linux_kernel_version > 0x030f0000 /* OFD locks are available since 3.15, but engages here only for 3.16 and later kernels (i.e. LTS) because of reliability reasons */ @@ -30402,7 +30746,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, "The bitness of system `off_t` type is mismatch. Please " "fix build and/or NDK configuration."); #endif /* Android */ - mdbx_jitter4testing(true); + jitter4testing(true); assert(offset >= 0 && len > 0); assert((uint64_t)offset < (uint64_t)INT64_MAX && (uint64_t)len < (uint64_t)INT64_MAX && @@ -30416,16 +30760,19 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, ((uint64_t)offset + (uint64_t)len)); for (;;) { struct flock lock_op; - STATIC_ASSERT(sizeof(off_t) <= sizeof(lock_op.l_start) && - sizeof(off_t) <= sizeof(lock_op.l_len) && - OFF_T_MAX == (off_t)OFF_T_MAX); + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) && + sizeof(off_t) <= sizeof(lock_op.l_len) && + OFF_T_MAX == (off_t)OFF_T_MAX, + "Support for large/64-bit-sized files is misconfigured " + "for the target system and/or toolchain. " + "Please fix it or at least disable it completely."); memset(&lock_op, 0, sizeof(lock_op)); lock_op.l_type = lck; lock_op.l_whence = SEEK_SET; lock_op.l_start = offset; lock_op.l_len = len; int rc = fcntl(fd, cmd, &lock_op); - mdbx_jitter4testing(true); + jitter4testing(true); if (rc != -1) { if (cmd == op_getlk) { /* Checks reader by pid. Returns: @@ -30460,7 +30807,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, } } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) choice_fcntl(); @@ -30468,21 +30815,21 @@ MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { return lck_op(fd, wait ? 
op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); } -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(pid > 0); return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); @@ -30491,7 +30838,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { /*---------------------------------------------------------------------------*/ #if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -30502,7 +30849,7 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_destroy(ipc) ? errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -30520,7 +30867,7 @@ static int check_fstat(MDBX_env *env) { int rc = MDBX_SUCCESS; if (fstat(env->me_lazy_fd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "DXB", rc); + ERROR("fstat(%s), err %d", "DXB", rc); return rc; } @@ -30530,15 +30877,14 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "DXB", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { - mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } @@ -30546,7 +30892,7 @@ static int check_fstat(MDBX_env *env) { if (fstat(env->me_lfd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "LCK", rc); + ERROR("fstat(%s), err %d", "LCK", rc); return rc; } @@ -30556,26 +30902,25 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "LCK", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } /* Checking file size for detect the situation when we got the shared lock - * immediately after mdbx_lck_destroy(). */ + * immediately after osal_lck_destroy(). 
*/ if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { - mdbx_verbose("lck-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("lck-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { +__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) @@ -30586,10 +30931,10 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { #if defined(__linux__) || defined(__gnu_linux__) if (unlikely(mdbx_RunningOnWSL1)) { rc = ENOLCK /* No record locks available */; - mdbx_error("%s, err %u", - "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " - "injecting failure to avoid data loss", - rc); + ERROR("%s, err %u", + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " + "injecting failure to avoid data loss", + rc); return rc; } #endif /* Linux */ @@ -30600,8 +30945,8 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; @@ -30614,8 +30959,8 @@ retry: if (rc == MDBX_RESULT_TRUE) { rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "unlock-before-retry", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "unlock-before-retry", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } } @@ -30641,23 +30986,23 @@ retry: /* the cause may be a collision with POSIX's file-lock recovery. */ if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "dxb-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "dxb-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Fallback to lck-shared */ } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Here could be one of two: - * - mdbx_lck_destroy() from the another process was hold the lock + * - osal_lck_destroy() from the another process was hold the lock * during a destruction. - * - either mdbx_lck_seize() from the another process was got the exclusive + * - either osal_lck_seize() from the another process was got the exclusive * lock and doing initialization. * For distinguish these cases will use size of the lck-file later. */ @@ -30666,8 +31011,8 @@ retry: * competing process doesn't call lck_downgrade(). 
*/ rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "try-shared", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-shared", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -30675,7 +31020,7 @@ retry: if (rc == MDBX_RESULT_TRUE) goto retry; if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lck_fstat", rc); + ERROR("%s, err %u", "lck_fstat", rc); return rc; } @@ -30686,8 +31031,8 @@ retry: if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -30696,8 +31041,8 @@ retry: lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "lock-against-without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -30705,9 +31050,9 @@ retry: return MDBX_RESULT_FALSE; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -30720,15 +31065,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { if (rc == MDBX_SUCCESS) rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); if (unlikely(rc != 0)) { - mdbx_error("%s, err %u", "lck", rc); + ERROR("%s, err %u", "lck", rc); assert(MDBX_IS_ERROR(rc)); } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -30743,25 +31088,25 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_verbose("%p got exclusive, drown locks", (void *)env); + VERBOSE("%p got exclusive, drown locks", (void *)env); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; #else - rc = mdbx_ipclock_destroy(&lck->mti_rlock); + rc = osal_ipclock_destroy(&lck->mti_rlock); if (rc == 0) - rc = mdbx_ipclock_destroy(&lck->mti_wlock); + rc = osal_ipclock_destroy(&lck->mti_wlock); #endif /* MDBX_LOCKING */ - mdbx_assert(env, rc == 0); + eASSERT(env, rc == 0); if (rc == 0) { const bool synced = lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced) rc = ftruncate(env->me_lfd, 0) ? errno : 0; } - mdbx_jitter4testing(false); + jitter4testing(false); } /* 1) POSIX's fcntl() locks (i.e. 
when op_setlk == F_SETLK) should be restored @@ -30802,7 +31147,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /* restore file-locks */ rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) - rc = mdbx_rpid_set(inprocess_neighbor); + rc = osal_rpid_set(inprocess_neighbor); } } @@ -30813,7 +31158,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*---------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -30960,7 +31305,7 @@ bailout: #endif /* MDBX_LOCKING > 0 */ } -__cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, +__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, const int err) { int rc = err; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -30977,10 +31322,10 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, rc = MDBX_PANIC; } } - mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); + WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); - int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL); + int check_rc = cleanup_dead_readers(env, rlocked, NULL); check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -30998,7 +31343,7 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; if (unlikely(mreco_rc)) - mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); + ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; if (MDBX_IS_ERROR(rc)) @@ -31021,24 +31366,24 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, #error "FIXME" #endif /* MDBX_LOCKING */ - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); if (rc != EDEADLK) env->me_flags |= MDBX_FATAL_ERROR; return rc; } #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { /* avoid 32-bit Bionic bug/hang with 32-pit TID */ if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { pid_t tid = gettid(); if (unlikely(tid > 0xffff)) { - mdbx_fatal("Raise the ENOSYS(%d) error to avoid hang due " - "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " - "that don’t fit in 16 bits, see " - "https://android.googlesource.com/platform/bionic/+/master/" - "docs/32-bit-abi.md#is-too-small-for-large-pids", - ENOSYS, tid, tid); + FATAL("Raise the ENOSYS(%d) error to avoid hang due " + "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " + "that don’t fit in 16 bits, see " + "https://android.googlesource.com/platform/bionic/+/master/" + "docs/32-bit-abi.md#is-too-small-for-large-pids", + ENOSYS, tid, tid); return ENOSYS; } } @@ -31046,11 +31391,11 @@ MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, +static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, const bool dont_wait) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = mdbx_check_tid4bionic(); + int rc = osal_check_tid4bionic(); if (likely(rc == 0)) rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; @@ -31086,7 +31431,7 @@ static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, return rc; } -static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { +static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 int rc = pthread_mutex_unlock(ipc); @@ -31108,38 +31453,38 @@ static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace("%s", ">>"); - mdbx_jitter4testing(true); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + TRACE("%s", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { - mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>"); - mdbx_jitter4testing(true); + TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; } void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } #else diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ index 63689b66a..2b9ae758a 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -288,11 +288,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -436,8 +437,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -505,18 +507,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif 
(defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -678,6 +712,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -697,6 +733,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -741,6 +779,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -895,6 +956,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -958,7 +1029,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -978,7 +1049,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1016,8 +1087,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1025,8 +1096,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1035,36 +1106,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1076,23 +1147,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1110,7 +1184,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1120,7 +1194,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1133,7 +1213,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1141,13 +1221,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1157,12 +1237,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1173,8 +1253,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1184,15 +1264,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1201,57 +1281,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1260,25 +1340,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1286,17 +1367,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1306,7 +1388,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1319,24 +1401,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1352,7 +1433,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1373,7 +1454,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1381,14 +1462,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1401,14 +1482,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1423,15 +1504,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1441,14 +1522,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1685,6 +1780,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1694,11 +1801,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1957,14 +2064,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1973,6 +2077,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2041,8 +2149,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2098,15 +2206,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2141,26 +2249,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2268,7 +2376,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2287,11 +2398,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
@@ -2334,21 +2448,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2365,6 +2482,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2384,16 +2509,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2401,17 +2529,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2528,7 +2656,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2554,7 +2682,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2661,6 +2789,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2712,6 +2841,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2723,7 +2861,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2746,9 +2884,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2758,7 +2896,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2785,6 +2923,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2809,11 +2948,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2854,8 +2993,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2865,18 +3004,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
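The new one-byte mc_checking field reuses the page-flag bit positions (CC_BRANCH equals P_BRANCH, CC_LEAF equals P_LEAF, and so on), which is what lets CHECK_LEAF_TYPE() validate a page with a single XOR and mask. A minimal sketch of that identity, assuming the definitions above:

    #include <assert.h>

    /* illustrative sketch, not part of the library */
    static void check_leaf_type_example(MDBX_cursor *mc, MDBX_page *mp) {
      mc->mc_checking = CC_LEAF;       /* cursor expects ordinary leaf pages */
      mp->mp_flags = P_LEAF;
      assert(CHECK_LEAF_TYPE(mc, mp)); /* XOR of matching bits is zero */

      mp->mp_flags = P_BRANCH;         /* a branch page no longer matches */
      assert(!CHECK_LEAF_TYPE(mc, mp));
    }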
But for now we only handle these @@ -2909,13 +3057,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2926,18 +3076,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -2986,7 +3136,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -2995,11 +3145,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3034,142 +3184,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3194,15 +3340,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3264,8 +3410,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3408,7 +3552,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3433,15 +3578,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) // @@ -3645,70 +3790,6 @@ __cold bug::~bug() noexcept {} #endif /* Unused*/ -//------------------------------------------------------------------------------ - -template struct path_to_pchar { - const std::string str; - path_to_pchar(const PATH &path) : str(path.generic_string()) {} - operator const char *() const { return str.c_str(); } -}; - -template -MDBX_MAYBE_UNUSED PATH pchar_to_path(const char *c_str) { - return PATH(c_str); -} - -template <> struct path_to_pchar { - const char *const ptr; - path_to_pchar(const std::string &path) : ptr(path.c_str()) {} - operator const char *() const { return ptr; } -}; - -#if defined(_WIN32) || defined(_WIN64) - -#ifndef WC_ERR_INVALID_CHARS -static const DWORD WC_ERR_INVALID_CHARS = - (6 /* Windows Vista 
*/ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) - ? 0x00000080 - : 0; -#endif /* WC_ERR_INVALID_CHARS */ - -template <> struct path_to_pchar { - std::string str; - path_to_pchar(const std::wstring &path) { - if (!path.empty()) { - const int chars = - WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, path.data(), - int(path.size()), nullptr, 0, nullptr, nullptr); - if (chars == 0) - mdbx::error::throw_exception(GetLastError()); - str.append(chars, '\0'); - WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, path.data(), - int(path.size()), const_cast(str.data()), - chars, nullptr, nullptr); - } - } - operator const char *() const { return str.c_str(); } -}; - -template <> -MDBX_MAYBE_UNUSED std::wstring pchar_to_path(const char *c_str) { - std::wstring wstr; - if (c_str && *c_str) { - const int chars = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, c_str, - int(strlen(c_str)), nullptr, 0); - if (chars == 0) - mdbx::error::throw_exception(GetLastError()); - wstr.append(chars, '\0'); - MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, c_str, - int(strlen(c_str)), const_cast(wstr.data()), - chars); - } - return wstr; -} - -#endif /* Windows */ - } // namespace //------------------------------------------------------------------------------ @@ -4689,43 +4770,6 @@ bool env::is_pristine() const { bool env::is_empty() const { return get_stat().ms_leaf_pages == 0; } -#ifdef MDBX_STD_FILESYSTEM_PATH -env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, - bool force_dynamic_size) { - const path_to_pchar utf8(destination); - error::success_or_throw( - ::mdbx_env_copy(handle_, utf8, - (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | - (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE - : MDBX_CP_DEFAULTS))); - return *this; -} -#endif /* MDBX_STD_FILESYSTEM_PATH */ - -#if defined(_WIN32) || defined(_WIN64) -env &env::copy(const ::std::wstring &destination, bool compactify, - bool force_dynamic_size) { - const path_to_pchar<::std::wstring> utf8(destination); - error::success_or_throw( - ::mdbx_env_copy(handle_, utf8, - (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | - (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE - : MDBX_CP_DEFAULTS))); - return *this; -} -#endif /* Windows */ - -env &env::copy(const ::std::string &destination, bool compactify, - bool force_dynamic_size) { - const path_to_pchar<::std::string> utf8(destination); - error::success_or_throw( - ::mdbx_env_copy(handle_, utf8, - (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | - (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE - : MDBX_CP_DEFAULTS))); - return *this; -} - env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { error::success_or_throw( ::mdbx_env_copy2fd(handle_, fd, @@ -4735,35 +4779,86 @@ env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { return *this; } +env &env::copy(const char *destination, bool compactify, + bool force_dynamic_size) { + error::success_or_throw( + ::mdbx_env_copy(handle_, destination, + (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); + return *this; +} + +env &env::copy(const ::std::string &destination, bool compactify, + bool force_dynamic_size) { + return copy(destination.c_str(), compactify, force_dynamic_size); +} + +#if defined(_WIN32) || defined(_WIN64) +env &env::copy(const wchar_t *destination, bool compactify, + bool force_dynamic_size) { + error::success_or_throw( + ::mdbx_env_copyW(handle_, destination, + (compactify ? 
MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); + return *this; +} + +env &env::copy(const ::std::wstring &destination, bool compactify, + bool force_dynamic_size) { + return copy(destination.c_str(), compactify, force_dynamic_size); +} +#endif /* Windows */ + +#ifdef MDBX_STD_FILESYSTEM_PATH +env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, + bool force_dynamic_size) { + return copy(destination.native(), compactify, force_dynamic_size); +} +#endif /* MDBX_STD_FILESYSTEM_PATH */ + path env::get_path() const { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *c_wstr; + error::success_or_throw(::mdbx_env_get_pathW(handle_, &c_wstr)); + static_assert(sizeof(path::value_type) == sizeof(wchar_t), "Oops"); + return path(c_wstr); +#else const char *c_str; error::success_or_throw(::mdbx_env_get_path(handle_, &c_str)); - return pchar_to_path(c_str); + static_assert(sizeof(path::value_type) == sizeof(char), "Oops"); + return path(c_str); +#endif } +bool env::remove(const char *pathname, const remove_mode mode) { + return error::boolean_or_throw( + ::mdbx_env_delete(pathname, MDBX_env_delete_mode_t(mode))); +} + +bool env::remove(const ::std::string &pathname, const remove_mode mode) { + return remove(pathname.c_str(), mode); +} + +#if defined(_WIN32) || defined(_WIN64) +bool env::remove(const wchar_t *pathname, const remove_mode mode) { + return error::boolean_or_throw( + ::mdbx_env_deleteW(pathname, MDBX_env_delete_mode_t(mode))); +} + +bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { + return remove(pathname.c_str(), mode); +} +#endif /* Windows */ + #ifdef MDBX_STD_FILESYSTEM_PATH bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, const remove_mode mode) { - const path_to_pchar utf8(pathname); - return error::boolean_or_throw( - ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); + return remove(pathname.native(), mode); } #endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) -bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { - const path_to_pchar<::std::wstring> utf8(pathname); - return error::boolean_or_throw( - ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); -} -#endif /* Windows */ - -bool env::remove(const ::std::string &pathname, const remove_mode mode) { - const path_to_pchar<::std::string> utf8(pathname); - return error::boolean_or_throw( - ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); -} - //------------------------------------------------------------------------------ static inline MDBX_env *create_env() { @@ -4800,97 +4895,91 @@ __cold void env_managed::setup(unsigned max_maps, unsigned max_readers) { error::success_or_throw(::mdbx_env_set_maxdbs(handle_, max_maps)); } +__cold env_managed::env_managed(const char *pathname, + const operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + error::success_or_throw( + ::mdbx_env_open(handle_, pathname, op.make_flags(accede), 0)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const char *pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + set_geometry(cp.geometry); + error::success_or_throw(::mdbx_env_open( + handle_, 
pathname, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const ::std::string &pathname, + const operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), op, accede) {} + +__cold env_managed::env_managed(const ::std::string &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), cp, op, accede) {} + +#if defined(_WIN32) || defined(_WIN64) +__cold env_managed::env_managed(const wchar_t *pathname, + const operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + error::success_or_throw( + ::mdbx_env_openW(handle_, pathname, op.make_flags(accede), 0)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const wchar_t *pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + set_geometry(cp.geometry); + error::success_or_throw(::mdbx_env_openW( + handle_, pathname, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), op, accede) {} + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), cp, op, accede) {} +#endif /* Windows */ + #ifdef MDBX_STD_FILESYSTEM_PATH __cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, const operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar utf8(pathname); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} + : env_managed(pathname.native(), op, accede) {} __cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, const env_managed::create_parameters &cp, const env::operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar utf8(pathname); - set_geometry(cp.geometry); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), - cp.file_mode_bits)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} + : env_managed(pathname.native(), cp, op, accede) {} #endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) -__cold env_managed::env_managed(const ::std::wstring &pathname, - const operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::wstring> utf8(pathname); - 
error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} - -__cold env_managed::env_managed(const ::std::wstring &pathname, - const env_managed::create_parameters &cp, - const env::operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::wstring> utf8(pathname); - set_geometry(cp.geometry); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), - cp.file_mode_bits)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} -#endif /* Windows */ - -__cold env_managed::env_managed(const ::std::string &pathname, - const operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::string> utf8(pathname); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} - -__cold env_managed::env_managed(const ::std::string &pathname, - const env_managed::create_parameters &cp, - const env::operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::string> utf8(pathname); - set_geometry(cp.geometry); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), - cp.file_mode_bits)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} - //------------------------------------------------------------------------------ txn_managed txn::start_nested() { diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h index c6211aba2..f536d41df 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h @@ -626,9 +626,9 @@ typedef mode_t mdbx_mode_t; extern "C" { #endif -/* MDBX version 0.11.x */ +/* MDBX version 0.12.x */ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 11 +#define MDBX_VERSION_MINOR 12 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) @@ -824,13 +824,33 @@ enum MDBX_constants { /* THE FILES ******************************************************************* * At the file system level, the environment corresponds to a pair of files. 
*/ -/** \brief The name of the lock file in the environment */ +#ifndef MDBX_LOCKNAME +/** \brief The name of the lock file in the environment + * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCKNAME "/mdbx.lck" -/** \brief The name of the data file in the environment */ +#else +#define MDBX_LOCKNAME L"\\mdbx.lck" +#endif +#endif /* MDBX_LOCKNAME */ +#ifndef MDBX_DATANAME +/** \brief The name of the data file in the environment + * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_DATANAME "/mdbx.dat" +#else +#define MDBX_DATANAME L"\\mdbx.dat" +#endif +#endif /* MDBX_DATANAME */ +#ifndef MDBX_LOCK_SUFFIX /** \brief The suffix of the lock file when \ref MDBX_NOSUBDIR is used */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCK_SUFFIX "-lck" +#else +#define MDBX_LOCK_SUFFIX L"-lck" +#endif +#endif /* MDBX_LOCK_SUFFIX */ /* DEBUG & LOGGING ************************************************************/ @@ -1015,6 +1035,13 @@ LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, enum MDBX_env_flags_t { MDBX_ENV_DEFAULTS = 0, + /** Extra validation of DB structure and pages content. + * + * The `MDBX_VALIDATION` enabled the simple safe/careful mode for working + * with damaged or untrusted DB. However, a notable performance + * degradation should be expected. */ + MDBX_VALIDATION = UINT32_C(0x00002000), + /** No environment directory. * * By default, MDBX creates its environment in a directory whose pathname is @@ -1087,8 +1114,8 @@ enum MDBX_env_flags_t { * while opening the database/environment which is already used by another * process(es) with unknown mode/flags. In such cases, if there is a * difference in the specified flags (\ref MDBX_NOMETASYNC, - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM, - * \ref MDBX_COALESCE and \ref MDBX_NORDAHEAD), instead of returning an error, + * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM + * and \ref MDBX_NORDAHEAD), instead of returning an error, * the database will be opened in a compatibility with the already used mode. * * `MDBX_ACCEDE` has no effect if the current process is the only one either @@ -1195,6 +1222,7 @@ enum MDBX_env_flags_t { MDBX_NOMEMINIT = UINT32_C(0x1000000), /** Aims to coalesce a Garbage Collection items. + * \note Always enabled since v0.12 * * With `MDBX_COALESCE` flag MDBX will aims to coalesce items while recycling * a Garbage Collection. Technically, when possible short lists of pages @@ -2259,6 +2287,11 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathnameW, + MDBX_env_flags_t flags, mdbx_mode_t mode); +#endif /* Windows */ + /** \brief Deletion modes for \ref mdbx_env_delete(). * \ingroup c_extra * \see mdbx_env_delete() */ @@ -2301,6 +2334,10 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; * so no deletion was performed. */ LIBMDBX_API int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, + MDBX_env_delete_mode_t mode); +#endif /* Windows */ /** \brief Copy an MDBX environment to the specified path, with options. 
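The new MDBX_VALIDATION flag documented above enables the careful mode with extra page checking for damaged or untrusted databases. A minimal usage sketch (the path is a placeholder and error handling is trimmed):

    #include "mdbx.h"

    int open_suspect_db(MDBX_env **env) {
      int rc = mdbx_env_create(env);
      if (rc != MDBX_SUCCESS)
        return rc;
      /* read-only plus the extra-validation mode introduced in v0.12 */
      rc = mdbx_env_open(*env, "./suspect-db" /* placeholder */,
                         MDBX_RDONLY | MDBX_VALIDATION, 0);
      if (rc != MDBX_SUCCESS)
        mdbx_env_close(*env);
      return rc;
    }

The same patch also adds MDBX_VALIDATION to ENV_CHANGEABLE_FLAGS, so the flag can be toggled when an existing environment is re-opened.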
* \ingroup c_extra @@ -2335,6 +2372,10 @@ LIBMDBX_API int mdbx_env_delete(const char *pathname, * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, MDBX_copy_flags_t flags); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest, + MDBX_copy_flags_t flags); +#endif /* Windows */ /** \brief Copy an environment to the specified file descriptor, with * options. @@ -2482,6 +2523,9 @@ struct MDBX_envinfo { uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ uint64_t wops; /**< Number of explicit write operations (not a pages) to a disk */ + uint64_t + gcrtime_seconds16dot16; /**< Time spent loading and searching inside + GC (aka FreeDB) in 1/65536 of second. */ } mi_pgop_stat; }; #ifndef __cplusplus @@ -2784,7 +2828,11 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ +#if !(defined(_WIN32) || defined(_WIN64)) LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); +#else +LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); +#endif /* Windows */ /** \brief Return the file descriptor for the given environment. * \ingroup c_statinfo @@ -5055,11 +5103,12 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * this value into account to evaluate the impact that * a long-running transaction has. * \param [in] retry A retry number starting from 0. - * If callback has returned 0 at least once, then at end - * of current handling loop the callback function will be - * called additionally with negative value to notify about - * the end of loop. The callback function can use this value - * to implement timeout logic while waiting for readers. + * If callback has returned 0 at least once, then at end of + * current handling loop the callback function will be + * called additionally with negative `retry` value to notify + * about the end of loop. The callback function can use this + * fact to implement timeout reset logic while waiting for + * a readers. * * \returns The RETURN CODE determines the further actions libmdbx and must * match the action which was executed by the callback: @@ -5082,7 +5131,7 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * \retval 1 Transaction aborted asynchronous and reader slot * should be cleared immediately, i.e. read transaction * will not continue but \ref mdbx_txn_abort() - * or \ref mdbx_txn_reset() will be called later. + * nor \ref mdbx_txn_reset() will be called later. * * \retval 2 or great The reader process was terminated or killed, * and libmdbx should entirely reset reader registration. @@ -5175,6 +5224,12 @@ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, + const wchar_t *pathnameW, + unsigned target_meta, + bool writeable); +#endif /* Windows */ /** \brief Turn database to the specified meta-page. 
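The new mi_pgop_stat.gcrtime_seconds16dot16 counter shown above reports GC/FreeDB search time as a 16.16 fixed-point value, so converting it to seconds is a division by 65536. A small sketch, assuming an already opened environment:

    #include <stdio.h>
    #include "mdbx.h"

    static void print_gc_time(const MDBX_env *env) {
      MDBX_envinfo info;
      if (mdbx_env_info_ex(env, NULL, &info, sizeof(info)) == MDBX_SUCCESS) {
        const double seconds =
            info.mi_pgop_stat.gcrtime_seconds16dot16 / 65536.0;
        printf("time spent searching GC/FreeDB: %.3f s\n", seconds);
      }
    }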
* @@ -5185,230 +5240,8 @@ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); /** end of btree_traversal @} */ -/**** Attribute support functions for Nexenta (scheduled for removal) - * *****************************************************************/ -#if defined(MDBX_NEXENTA_ATTRS) || defined(DOXYGEN) -/** \defgroup nexenta Attribute support functions for Nexenta - * \ingroup c_crud - * @{ */ -typedef uint_fast64_t mdbx_attr_t; - -/** Store by cursor with attribute. - * - * This function stores key/data pairs into the database. The cursor is - * positioned at the new item, or on failure usually near it. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open() - * \param [in] key The key operated on. - * \param [in] data The data operated on. - * \param [in] attr The attribute. - * \param [in] flags Options for this operation. This parameter must be set - * to 0 or one of the values described here: - * - \ref MDBX_CURRENT - * Replace the item at the current cursor position. The key parameter - * must still be provided, and must match it, otherwise the function - * return \ref MDBX_EKEYMISMATCH. - * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. No key - * comparisons are performed. This option allows fast bulk loading when - * keys are already known to be in the correct order. Loading unsorted - * keys with this flag will cause a \ref MDBX_KEYEXIST error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_EKEYMISMATCH - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write in a read-only - * transaction. - * \retval MDBX_EINVAL an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Store items and attributes into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to store in the database. - * \param [in] attr The attribute to store in the database. - * \param [in,out] data The data to store. - * \param [in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or - * more of the values described here: - * - \ref MDBX_NOOVERWRITE - * Enter the new key/data pair only if the key does not already appear - * in the database. The function will return \ref MDBX_KEYEXIST if the key - * already appears in the database. The data parameter will be set to - * point to the existing item. - * - * - \ref MDBX_CURRENT - * Update an single existing entry, but not add new ones. The function - * will return \ref MDBX_NOTFOUND if the given key not exist in the - * database. 
Or the \ref MDBX_EMULTIVAL in case duplicates for the given - * key. - * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. This option - * allows fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a \ref MDBX_EKEYMISMATCH error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_KEYEXIST - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write - * in a read-only transaction. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Set items attribute from a database. - * - * This function stores key/data pairs attribute to the database. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in] data The data to be stored or NULL to save previous value. - * \param [in] attr The attribute to be stored. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key-value pair was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr); - -/** Get items attribute from a database cursor. - * - * This function retrieves key/data pairs from the database. The address and - * length of the key are returned in the object to which key refers (except - * for the case of the \ref MDBX_SET option, in which the key object is - * unchanged), and the address and length of the data are returned in the object - * to which data refers. - * \see mdbx_get() - * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). - * \param [in,out] key The key for a retrieved item. - * \param [in,out] data The data of a retrieved item. - * \param [out] pattr The pointer to retrieve attribute. - * \param [in] op A cursor operation MDBX_cursor_op. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND No matching key found. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr, - MDBX_cursor_op op); - -/** Get items attribute from a database. - * - * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified key are returned - * in the structure to which data refers. - * If the database supports duplicate keys (see \ref MDBX_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of \ref mdbx_cursor_get(). - * - * \note The memory pointed to by the returned values is owned by the - * database. 
The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a `SIGSEGV`. - * - * \note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in,out] data The data corresponding to the key. - * \param [out] pattr The pointer to retrieve attribute. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr); -/** end of nexenta @} */ -#endif /* MDBX_NEXENTA_ATTRS */ - /** end of c_api @} */ -/******************************************************************************* - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. */ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -LIBMDBX_API int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n); -LIBMDBX_API int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2); -LIBMDBX_API int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n); -LIBMDBX_API size_t mdbx_e2k_strlen_bug_workaround(const char *s); -LIBMDBX_API size_t mdbx_e2k_strnlen_bug_workaround(const char *s, - size_t maxlen); -#ifdef __cplusplus -namespace std { -inline int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - return ::mdbx_e2k_memcmp_bug_workaround(s1, s2, n); -} -inline int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - return ::mdbx_e2k_strcmp_bug_workaround(s1, s2); -} -inline int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - return ::mdbx_e2k_strncmp_bug_workaround(s1, s2, n); -} -inline size_t mdbx_e2k_strlen_bug_workaround(const char *s) { - return ::mdbx_e2k_strlen_bug_workaround(s); -} -inline size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - return ::mdbx_e2k_strnlen_bug_workaround(s, maxlen); -} -} // namespace std -#endif /* __cplusplus */ - -#include -#include -#undef memcmp -#define memcmp mdbx_e2k_memcmp_bug_workaround -#undef bcmp -#define bcmp mdbx_e2k_memcmp_bug_workaround -#undef strcmp -#define strcmp mdbx_e2k_strcmp_bug_workaround -#undef strncmp -#define strncmp mdbx_e2k_strncmp_bug_workaround -#undef strlen -#define strlen mdbx_e2k_strlen_bug_workaround -#undef strnlen -#define strnlen mdbx_e2k_strnlen_bug_workaround -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ index 6a727f3e8..623b4cc21 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ @@ -965,20 +965,21 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { hash_value() const noexcept; /// \brief Three-way fast non-lexicographically length-based comparison. 
- /// \return value: - /// == 0 if "a" == "b", - /// < 0 if "a" shorter than "b", - /// > 0 if "a" longer than "b", - /// < 0 if "a" length-equal and lexicographically less than "b", - /// > 0 if "a" length-equal and lexicographically great than "b". + /// \details Firstly compares length and if it equal then compare content + /// lexicographically. \return value: + /// `== 0` if `a` the same as `b`; + /// `< 0` if `a` shorter than `b`, + /// or the same length and lexicographically less than `b`; + /// `> 0` if `a` longer than `b`, + /// or the same length and lexicographically great than `b`. MDBX_NOTHROW_PURE_FUNCTION static MDBX_CXX14_CONSTEXPR intptr_t compare_fast(const slice &a, const slice &b) noexcept; /// \brief Three-way lexicographically comparison. /// \return value: - /// < 0 if "a" < "b", - /// == 0 if "a" == "b", - /// > 0 if "a" > "b". + /// `== 0` if `a` lexicographically equal `b`; + /// `< 0` if `a` lexicographically less than `b`; + /// `> 0` if `a` lexicographically great than `b`. MDBX_NOTHROW_PURE_FUNCTION static MDBX_CXX14_CONSTEXPR intptr_t compare_lexicographically(const slice &a, const slice &b) noexcept; friend MDBX_CXX14_CONSTEXPR bool operator==(const slice &a, @@ -3224,9 +3225,13 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env ©(const ::std::wstring &destination, bool compactify, bool force_dynamic_size = false); + env ©(const wchar_t *destination, bool compactify, + bool force_dynamic_size = false); #endif /* Windows */ env ©(const ::std::string &destination, bool compactify, bool force_dynamic_size = false); + env ©(const char *destination, bool compactify, + bool force_dynamic_size = false); /// \brief Copy an environment to the specified file descriptor. env ©(filehandle fd, bool compactify, bool force_dynamic_size = false); @@ -3251,14 +3256,18 @@ public: /// \brief Removes the environment's files in a proper and multiprocess-safe /// way. #ifdef MDBX_STD_FILESYSTEM_PATH - static bool remove(const MDBX_STD_FILESYSTEM_PATH &, + static bool remove(const MDBX_STD_FILESYSTEM_PATH &pathname, const remove_mode mode = just_remove); #endif /* MDBX_STD_FILESYSTEM_PATH */ #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) - static bool remove(const ::std::wstring &, + static bool remove(const ::std::wstring &pathname, + const remove_mode mode = just_remove); + static bool remove(const wchar_t *pathname, const remove_mode mode = just_remove); #endif /* Windows */ - static bool remove(const ::std::string &, + static bool remove(const ::std::string &pathname, + const remove_mode mode = just_remove); + static bool remove(const char *pathname, const remove_mode mode = just_remove); /// \brief Statistics for a database in the MDBX environment. @@ -3496,15 +3505,19 @@ public: /// \brief Open existing database. 
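The clarified compare_fast() contract above orders slices by length first and falls back to byte-wise comparison only when the lengths match. A rough sketch of that rule, written in C against the underlying MDBX_val for brevity (the real method may differ in the magnitude of the returned value):

    #include <stdint.h>
    #include <string.h>
    #include "mdbx.h"

    /* sketch of the documented ordering, not the library's implementation */
    static intptr_t compare_fast_rule(const MDBX_val *a, const MDBX_val *b) {
      if (a->iov_len != b->iov_len)
        return (a->iov_len < b->iov_len) ? -1 : 1;
      return memcmp(a->iov_base, b->iov_base, a->iov_len);
    }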
#ifdef MDBX_STD_FILESYSTEM_PATH - env_managed(const MDBX_STD_FILESYSTEM_PATH &, const operate_parameters &, - bool accede = true); + env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, + const operate_parameters &, bool accede = true); #endif /* MDBX_STD_FILESYSTEM_PATH */ #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) - env_managed(const ::std::wstring &, const operate_parameters &, + env_managed(const ::std::wstring &pathname, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const operate_parameters &, + bool accede = true); #endif /* Windows */ - env_managed(const ::std::string &, const operate_parameters &, + env_managed(const ::std::string &pathname, const operate_parameters &, bool accede = true); + explicit env_managed(const char *pathname, const operate_parameters &, + bool accede = true); /// \brief Additional parameters for creating a new database. struct create_parameters { @@ -3517,15 +3530,20 @@ public: /// \brief Create new or open existing database. #ifdef MDBX_STD_FILESYSTEM_PATH - env_managed(const MDBX_STD_FILESYSTEM_PATH &, const create_parameters &, - const operate_parameters &, bool accede = true); + env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, + const create_parameters &, const operate_parameters &, + bool accede = true); #endif /* MDBX_STD_FILESYSTEM_PATH */ #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) - env_managed(const ::std::wstring &, const create_parameters &, + env_managed(const ::std::wstring &pathname, const create_parameters &, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const create_parameters &, + const operate_parameters &, bool accede = true); #endif /* Windows */ - env_managed(const ::std::string &, const create_parameters &, + env_managed(const ::std::string &pathname, const create_parameters &, const operate_parameters &, bool accede = true); + explicit env_managed(const char *pathname, const create_parameters &, + const operate_parameters &, bool accede = true); /// \brief Explicitly closes the environment and release the memory map. /// diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c index 81d564dc9..c98a9c65e 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -310,11 +310,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -458,8 +459,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -527,18 +529,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -700,6 +734,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -719,6 +755,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in 
separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -763,6 +801,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -917,6 +978,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -980,7 +1051,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1000,7 +1071,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1038,8 +1109,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1047,8 +1118,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1057,36 +1128,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1098,23 +1169,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1132,7 +1206,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1142,7 +1216,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1155,7 +1235,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1163,13 +1243,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1179,12 +1259,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1195,8 +1275,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1206,15 +1286,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1223,57 +1303,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1282,25 +1362,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1308,17 +1389,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1328,7 +1410,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1341,24 +1423,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
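/* Illustrative sketch, not part of the upstream patch: the hunks above rename
 * the internal OS-abstraction layer from the mdbx_* prefix to osal_*
 * (osal_condpair_*, osal_fastmutex_*, osal_pread/osal_pwrite, osal_fsync, ...)
 * without changing behaviour, and osal_fsync() takes a bitmask of the
 * MDBX_SYNC_* values from enum osal_syncmode_bits. The self-contained POSIX
 * demo below only shows that flag-combination pattern; demo_fsync() is a
 * hypothetical stand-in, not the real osal_fsync() implementation. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

enum demo_syncmode_bits { /* mirrors osal_syncmode_bits above */
  DEMO_SYNC_NONE = 0,
  DEMO_SYNC_DATA = 1,
  DEMO_SYNC_SIZE = 2,
  DEMO_SYNC_IODQ = 4
};

static int demo_fsync(int fd, unsigned mode_bits) {
  if (mode_bits == DEMO_SYNC_NONE)
    return 0; /* nothing was requested */
  /* syncing the file size too requires a full fsync(); a data-only request
   * may use fdatasync() where available */
  if (mode_bits & DEMO_SYNC_SIZE)
    return fsync(fd) ? -1 : 0;
  return fdatasync(fd) ? -1 : 0;
}

int main(void) {
  int fd = open("demo.tmp", O_CREAT | O_RDWR, 0644);
  if (fd < 0 || write(fd, "x", 1) != 1)
    return 1;
  int rc = demo_fsync(fd, DEMO_SYNC_DATA | DEMO_SYNC_SIZE);
  printf("demo_fsync rc=%d\n", rc);
  close(fd);
  return rc ? 1 : 0;
}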
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1374,7 +1455,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1395,7 +1476,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1403,14 +1484,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1423,14 +1504,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1445,15 +1526,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1463,14 +1544,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1707,6 +1802,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1716,11 +1823,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1979,14 +2086,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1995,6 +2099,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2063,8 +2171,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2120,15 +2228,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2163,26 +2271,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2290,7 +2398,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2309,11 +2420,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
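/* Illustrative sketch, not part of the upstream patch: in the MDBX_meta hunk
 * above the two txnid halves become MDBX_atomic_uint32_t and are wrapped in
 * unions with unsafe_txnid / unsafe_sign. Per the surrounding comments,
 * mm_txnid_a is written before the meta payload and mm_txnid_b after it, so a
 * reader that observes mm_txnid_a != mm_txnid_b knows it raced an in-progress
 * update and must retry. The C11 fragment below only reconstructs that
 * two-slot handshake with hypothetical names (demo_meta, demo_meta_publish,
 * demo_meta_snapshot); it is not libmdbx's actual meta-page code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct demo_meta {
  _Atomic uint64_t txnid_a; /* written first, like mm_txnid_a */
  _Atomic uint64_t payload; /* stands in for the rest of the meta page */
  _Atomic uint64_t txnid_b; /* written last, like mm_txnid_b */
} demo_meta;

static void demo_meta_publish(demo_meta *m, uint64_t txnid, uint64_t payload) {
  atomic_store_explicit(&m->txnid_a, txnid, memory_order_release);
  atomic_store_explicit(&m->payload, payload, memory_order_release);
  atomic_store_explicit(&m->txnid_b, txnid, memory_order_release);
}

/* Fills *out and returns true only when both slots agree, i.e. the snapshot
 * was not torn by a concurrent demo_meta_publish(). */
static bool demo_meta_snapshot(const demo_meta *m, uint64_t *out) {
  const uint64_t b = atomic_load_explicit(&m->txnid_b, memory_order_acquire);
  const uint64_t p = atomic_load_explicit(&m->payload, memory_order_acquire);
  const uint64_t a = atomic_load_explicit(&m->txnid_a, memory_order_acquire);
  if (a != b)
    return false; /* torn read, the caller should retry */
  *out = p;
  return true;
}

int main(void) {
  demo_meta m = {0, 0, 0};
  demo_meta_publish(&m, 42, 0xabcd);
  uint64_t payload;
  if (demo_meta_snapshot(&m, &payload))
    printf("txnid 42, payload %llx\n", (unsigned long long)payload);
  return 0;
}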
@@ -2356,21 +2470,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2387,6 +2504,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2406,16 +2531,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2423,17 +2551,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2550,7 +2678,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2576,7 +2704,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2683,6 +2811,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2734,6 +2863,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2745,7 +2883,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2768,9 +2906,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2780,7 +2918,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2807,6 +2945,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2831,11 +2970,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2876,8 +3015,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2887,18 +3026,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2931,13 +3079,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2948,18 +3098,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3008,7 +3158,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3017,11 +3167,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3056,142 +3206,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3216,15 +3362,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3286,8 +3432,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3430,7 +3574,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3455,15 +3600,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) @@ -3623,7 +3768,7 @@ struct { #define dbi_main walk.dbi[MAIN_DBI] #define dbi_meta walk.dbi[CORE_DBS] -int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; +int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; MDBX_env *env; MDBX_txn *txn; MDBX_envinfo envinfo; @@ -3655,7 +3800,8 @@ static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { } } -static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { +static void va_log(MDBX_log_level_t level, const char *function, int line, + const char *msg, va_list args) { static const char *const prefixes[] = { "!!!fatal: ", " ! 
" /* error */, " ~ " /* warning */, " " /* notice */, " // " /* verbose */, " //// " /* debug */, @@ -3673,13 +3819,20 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { fflush(nullptr); fputs(prefixes[level], out); vfprintf(out, msg, args); - if (msg[strlen(msg) - 1] != '\n') + + const bool have_lf = msg[strlen(msg) - 1] == '\n'; + if (level == MDBX_LOG_FATAL && function && line) + fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", + function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); + else if (!have_lf) fputc('\n', out); fflush(nullptr); } if (level == MDBX_LOG_FATAL) { +#if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS exit(EXIT_FAILURE_MDBX); +#endif abort(); } } @@ -3687,7 +3840,7 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) { va_list args; va_start(args, msg); - va_log(MDBX_LOG_ERROR, msg, args); + va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); va_end(args); } @@ -3696,7 +3849,7 @@ static void logger(MDBX_log_level_t level, const char *function, int line, (void)line; (void)function; if (level < MDBX_LOG_EXTRA) - va_log(level, msg, args); + va_log(level, function, line, msg, args); } static int check_user_break(void) { @@ -3715,12 +3868,12 @@ static void pagemap_cleanup(void) { for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; i < ARRAY_LENGTH(walk.dbi); ++i) { if (walk.dbi[i].name) { - mdbx_free((void *)walk.dbi[i].name); + osal_free((void *)walk.dbi[i].name); walk.dbi[i].name = nullptr; } } - mdbx_free(walk.pagemap); + osal_free(walk.pagemap); walk.pagemap = nullptr; } @@ -3751,7 +3904,7 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (dbi == ARRAY_END(walk.dbi)) return nullptr; - dbi->name = mdbx_strdup(dbi_name); + dbi->name = osal_strdup(dbi_name); return last = dbi; } @@ -3769,7 +3922,7 @@ static void MDBX_PRINTF_ARGS(4, 5) break; if (!p) { - p = mdbx_calloc(1, sizeof(*p)); + p = osal_calloc(1, sizeof(*p)); if (unlikely(!p)) return; p->caption = msg; @@ -3814,7 +3967,7 @@ static size_t problems_pop(struct problem *list) { count += problems_list->count; print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, problems_list->count); - mdbx_free(problems_list); + osal_free(problems_list); problems_list = p; } print("\n"); @@ -4051,7 +4204,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, number = data->iov_len / sizeof(pgno_t) - 1; } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= /* LY: allow gap up to one page. 
it is ok - * and better than shink-and-retry inside mdbx_update_gc() */ + * and better than shink-and-retry inside update_gc() */ envinfo.mi_dxb_pagesize) problem_add("entry", txnid, "extra idl space", "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", @@ -4148,7 +4301,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, return handle_userdb(record_number, key, data); } - name = mdbx_malloc(key->iov_len + 1); + name = osal_malloc(key->iov_len + 1); if (unlikely(!name)) return MDBX_ENOMEM; memcpy(name, key->iov_base, key->iov_len); @@ -4156,7 +4309,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, userdb_count++; rc = process_db(~0u, name, handle_userdb, false); - mdbx_free(name); + osal_free(name); if (rc != MDBX_INCOMPATIBLE) return rc; @@ -4335,9 +4488,9 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, } if (ignore_wrong_order) { /* for debugging with enabled assertions */ - mc->mc_flags |= C_SKIPORD; + mc->mc_checking |= CC_SKIPORD; if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); @@ -4761,7 +4914,9 @@ int main(int argc, char *argv[]) { mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) ? (MDBX_log_level_t)(verbose + 1) : MDBX_LOG_TRACE, - MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, logger); + MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | + MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, + logger); rc = mdbx_env_create(&env); if (rc) { @@ -4860,7 +5015,7 @@ int main(int argc, char *argv[]) { } #endif if (rc) { - error("mdbx_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } @@ -5024,7 +5179,7 @@ int main(int argc, char *argv[]) { print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(nullptr); - walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); + walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); if (!walk.pagemap) { rc = errno ? errno : MDBX_ENOMEM; error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c index a784bf6a4..38a98614c 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -310,11 +310,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -458,8 +459,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -527,18 +529,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -700,6 +734,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -719,6 +755,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in 
separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -763,6 +801,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -917,6 +978,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -980,7 +1051,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1000,7 +1071,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1038,8 +1109,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1047,8 +1118,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1057,36 +1128,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1098,23 +1169,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1132,7 +1206,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1142,7 +1216,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1155,7 +1235,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1163,13 +1243,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1179,12 +1259,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1195,8 +1275,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1206,15 +1286,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1223,57 +1303,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1282,25 +1362,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1308,17 +1389,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1328,7 +1410,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1341,24 +1423,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1374,7 +1455,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1395,7 +1476,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1403,14 +1484,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1423,14 +1504,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1445,15 +1526,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1463,14 +1544,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1707,6 +1802,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1716,11 +1823,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1979,14 +2086,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1995,6 +2099,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2063,8 +2171,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2120,15 +2228,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2163,26 +2271,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2290,7 +2398,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2309,11 +2420,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
@@ -2356,21 +2470,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2387,6 +2504,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2406,16 +2531,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2423,17 +2551,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2550,7 +2678,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2576,7 +2704,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2683,6 +2811,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2734,6 +2863,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2745,7 +2883,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2768,9 +2906,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2780,7 +2918,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2807,6 +2945,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2831,11 +2970,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2876,8 +3015,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2887,18 +3026,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2931,13 +3079,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2948,18 +3098,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3008,7 +3158,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3017,11 +3167,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3056,142 +3206,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3216,15 +3362,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3286,8 +3432,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3430,7 +3574,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3455,15 +3600,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c index a0a5f28e2..1c0e267bd 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c @@ -22,7 +22,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -36,7 +36,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -312,11 +312,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -460,8 +461,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -529,18 +531,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -702,6 +736,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -721,6 +757,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in 
separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -765,6 +803,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -919,6 +980,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -982,7 +1053,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1002,7 +1073,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1040,8 +1111,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1049,8 +1120,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1059,36 +1130,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1100,23 +1171,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1134,7 +1208,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1144,7 +1218,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1157,7 +1237,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1165,13 +1245,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1181,12 +1261,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1197,8 +1277,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1208,15 +1288,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1225,57 +1305,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1284,25 +1364,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1310,17 +1391,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1330,7 +1412,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1343,24 +1425,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
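Several of the file-handling entry points above now take const pathchar_t * rather than const char *; on Windows pathchar_t is wchar_t, and the newly declared osal_mb2w() bridges multibyte strings to that wide representation. Purely as a hedged sketch of the idea (the conversion actually added by this patch lives elsewhere and may differ), such a helper could be built on the Win32 MultiByteToWideChar API:

#include <windows.h>
#include <stddef.h>

/* Hypothetical mb2w-style conversion (an assumption, not the patch's code):
 * with dst == NULL it reports the required number of wide characters,
 * otherwise it converts src_n bytes into the caller-provided buffer. */
static size_t sketch_mb2w(wchar_t *dst, size_t dst_n, const char *src,
                          size_t src_n) {
  if (!dst || dst_n == 0)
    return (size_t)MultiByteToWideChar(CP_UTF8, 0, src, (int)src_n, NULL, 0);
  const int done =
      MultiByteToWideChar(CP_UTF8, 0, src, (int)src_n, dst, (int)dst_n);
  return done > 0 ? (size_t)done : 0;
}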
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1376,7 +1457,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1397,7 +1478,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1405,14 +1486,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1425,14 +1506,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
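The locking-contract comments above spell out the possible results of osal_lck_seize(): MDBX_RESULT_TRUE for an exclusive lock, MDBX_RESULT_FALSE for a shared one, and any other value as an error code. As a purely illustrative sketch of that contract (this is not code from the patch, and the real open path does considerably more), a caller would branch roughly like this:

/* Sketch only: acting on the documented osal_lck_seize() outcomes. */
static int sketch_seize(MDBX_env *env) {
  const int rc = osal_lck_seize(env);
  if (rc == MDBX_RESULT_TRUE) {
    /* exclusive: first and only user since the last use of the DB, so the
     * shared synchronization objects in the LCK-file may be (re)initialized */
  } else if (rc == MDBX_RESULT_FALSE) {
    /* shared: the DB is already open and in use by other processes */
  } else {
    return rc; /* any other value is an error code */
  }
  /* the lock is later adjusted to its operational level via
   * osal_lck_downgrade(), as described above */
  return MDBX_SUCCESS;
}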
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1447,15 +1528,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1465,14 +1546,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1709,6 +1804,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1718,11 +1825,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1981,14 +2088,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1997,6 +2101,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2065,8 +2173,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2122,15 +2230,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2165,26 +2273,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2292,7 +2400,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2311,11 +2422,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
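The reworked MDBX_meta above stores the transaction id of the two-phase-update pair as two MDBX_atomic_uint32_t halves, with an unsafe_txnid union member for raw 64-bit access. Purely as a sketch of the reassembly (an assumption, not the patch's meta-reading routine; the real engine also has to cope with concurrent two-phase updates of the mm_txnid_a/mm_txnid_b pair, which this ignores), a value can be rebuilt with the atomic_load32() helper shown earlier:

/* Sketch only: combine the two 32-bit atomic halves into a 64-bit txnid,
 * low word first as the union with unsafe_txnid implies on little-endian
 * targets (an assumption made for this illustration). */
static uint64_t sketch_read_txnid(const volatile MDBX_atomic_uint32_t pair[2]) {
  const uint32_t lo = atomic_load32(&pair[0], mo_AcquireRelease);
  const uint32_t hi = atomic_load32(&pair[1], mo_AcquireRelease);
  return (uint64_t)hi << 32 | lo;
}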
@@ -2358,21 +2472,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2389,6 +2506,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2408,16 +2533,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2425,17 +2553,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2552,7 +2680,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2578,7 +2706,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2685,6 +2813,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2736,6 +2865,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2747,7 +2885,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2770,9 +2908,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2782,7 +2920,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2809,6 +2947,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2833,11 +2972,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2878,8 +3017,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2889,18 +3028,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2933,13 +3081,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2950,18 +3100,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3010,7 +3160,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3019,11 +3169,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3058,142 +3208,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3218,15 +3364,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3288,8 +3434,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3432,7 +3576,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3457,15 +3602,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c index cdb58a69b..cd678bc88 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -310,11 +310,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -458,8 +459,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -527,18 +529,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -700,6 +734,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -719,6 +755,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in 
separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -763,6 +801,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -917,6 +978,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -980,7 +1051,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1000,7 +1071,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1038,8 +1109,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1047,8 +1118,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1057,36 +1128,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1098,23 +1169,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1132,7 +1206,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1142,7 +1216,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1155,7 +1235,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1163,13 +1243,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1179,12 +1259,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1195,8 +1275,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1206,15 +1286,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1223,57 +1303,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1282,25 +1362,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1308,17 +1389,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1328,7 +1410,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1341,24 +1423,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1374,7 +1455,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1395,7 +1476,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1403,14 +1484,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1423,14 +1504,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1445,15 +1526,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1463,14 +1544,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1707,6 +1802,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1716,11 +1823,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1979,14 +2086,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1995,6 +2099,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2063,8 +2171,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2120,15 +2228,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2163,26 +2271,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2290,7 +2398,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2309,11 +2420,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
@@ -2356,21 +2470,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2387,6 +2504,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2406,16 +2531,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2423,17 +2551,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2550,7 +2678,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2576,7 +2704,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2683,6 +2811,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2734,6 +2863,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2745,7 +2883,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2768,9 +2906,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2780,7 +2918,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2807,6 +2945,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2831,11 +2970,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2876,8 +3015,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2887,18 +3026,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2931,13 +3079,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2948,18 +3098,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3008,7 +3158,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3017,11 +3167,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3056,142 +3206,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3216,15 +3362,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3286,8 +3432,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3430,7 +3574,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3455,15 +3600,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) @@ -3716,10 +3861,10 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { error("mdbx_cursor_open", rc); return rc; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == @@ -3886,7 +4031,9 @@ int main(int argc, char *argv[]) { rc = mdbx_env_open( env, envname, - envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE : MDBX_RDONLY), 0); + envflags | (rescue ? 
MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION + : MDBX_RDONLY), + 0); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_env_open", rc); goto env_close; @@ -3913,10 +4060,10 @@ int main(int argc, char *argv[]) { error("mdbx_cursor_open", rc); goto txn_abort; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } bool have_raw = false; @@ -3931,7 +4078,7 @@ int main(int argc, char *argv[]) { if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_realloc(buf4free, key.iov_len + 1); + subname = osal_realloc(buf4free, key.iov_len + 1); if (!subname) { rc = MDBX_ENOMEM; break; diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c index 9e9e11cda..62dfd9b4d 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -310,11 +310,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -458,8 +459,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -527,18 +529,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + 
(defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -700,6 +734,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -719,6 +755,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -763,6 +801,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -917,6 +978,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -980,7 +1051,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if 
defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1000,7 +1071,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1038,8 +1109,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1047,8 +1118,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1057,36 +1128,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? 
HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1098,23 +1169,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1132,7 +1206,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1142,7 +1216,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1155,7 +1235,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1163,13 +1243,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1179,12 +1259,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1195,8 +1275,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1206,15 +1286,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1223,57 +1303,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1282,25 +1362,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1308,17 +1389,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1328,7 +1410,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1341,24 +1423,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
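/* A minimal illustrative sketch, not taken from the libmdbx sources: it only
 * shows how the renamed osal_fsync() composes with the osal_syncmode_bits
 * declared above. The helper name and the particular bit combination (data
 * pages, file length and the device I/O queue flushed together for a durable
 * commit) are assumptions for illustration; real callers pick the bits that
 * match their sync policy. */
static int example_durable_sync(mdbx_filehandle_t fd) {
  /* flush data, the file size and the device internal queue in one request */
  return osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE | MDBX_SYNC_IODQ);
}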
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1374,7 +1455,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1395,7 +1476,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1403,14 +1484,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1423,14 +1504,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1445,15 +1526,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1463,14 +1544,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1707,6 +1802,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1716,11 +1823,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1979,14 +2086,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1995,6 +2099,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2063,8 +2171,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2120,15 +2228,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2163,26 +2271,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2290,7 +2398,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2309,11 +2420,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
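/* A minimal illustrative sketch of the two-phase txnid pair made explicit by
 * the unions above: mm_txnid_a is described as the first half of the
 * two-phase update and mm_txnid_b as the second, so a reader that sees the
 * halves disagree can treat the meta page as caught mid-update. The helper
 * name, the acquire-ordered loads and the plain equality test are assumptions
 * for illustration, not the library's actual meta validation. */
static bool example_meta_txnid_consistent(const volatile MDBX_meta *meta) {
  const uint32_t a_lo = atomic_load32(&meta->mm_txnid_a[0], mo_AcquireRelease);
  const uint32_t a_hi = atomic_load32(&meta->mm_txnid_a[1], mo_AcquireRelease);
  const uint32_t b_lo = atomic_load32(&meta->mm_txnid_b[0], mo_AcquireRelease);
  const uint32_t b_hi = atomic_load32(&meta->mm_txnid_b[1], mo_AcquireRelease);
  /* both halves must carry the same committed txnid */
  return a_lo == b_lo && a_hi == b_hi;
}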
@@ -2356,21 +2470,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2387,6 +2504,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2406,16 +2531,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2423,17 +2551,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2550,7 +2678,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2576,7 +2704,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2683,6 +2811,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2734,6 +2863,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2745,7 +2883,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2768,9 +2906,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2780,7 +2918,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2807,6 +2945,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2831,11 +2970,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2876,8 +3015,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2887,18 +3026,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2931,13 +3079,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2948,18 +3098,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3008,7 +3158,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3017,11 +3167,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3056,142 +3206,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3216,15 +3362,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3286,8 +3432,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3430,7 +3574,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3455,15 +3600,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) @@ -3743,7 +3888,7 @@ static int readhdr(void) { if (str) { if (*str) { free(subname); - subname = mdbx_strdup(str); + subname = osal_strdup(str); if (!subname) { if (!quiet) perror("strdup()"); @@ -3951,7 +4096,7 @@ __hot static int readline(MDBX_val *out, MDBX_val *buf) { /* Is buffer too short? 
*/ while (c1[len - 1] != '\n') { - buf->iov_base = mdbx_realloc(buf->iov_base, buf->iov_len * 2); + buf->iov_base = osal_realloc(buf->iov_base, buf->iov_len * 2); if (!buf->iov_base) { if (!quiet) fprintf(stderr, @@ -4090,7 +4235,7 @@ int main(int argc, char *argv[]) { envflags |= MDBX_NOSUBDIR; break; case 's': - subname = mdbx_strdup(optarg); + subname = osal_strdup(optarg); break; case 'N': putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; @@ -4136,7 +4281,7 @@ int main(int argc, char *argv[]) { fflush(nullptr); dbuf.iov_len = 4096; - dbuf.iov_base = mdbx_malloc(dbuf.iov_len); + dbuf.iov_base = osal_malloc(dbuf.iov_len); if (!dbuf.iov_base) { rc = MDBX_ENOMEM; error("value-buffer", rc); diff --git a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c index 462c4d52c..43116b383 100644 --- a/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c +++ b/crates/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* * Copyright 2015-2022 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e88c2083bb74c3b9e61253604256e2cd7d7c8bdb222d763e82b3b4abad7e4634_v0_11_8_0_gbd80e01e +#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -310,11 +310,12 @@ #define nullptr NULL #endif -#ifdef __APPLE__ +#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) +#include +#include #ifndef MAC_OS_X_VERSION_MIN_REQUIRED #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ #endif -#include #endif /* Apple OSX & iOS */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ @@ -458,8 +459,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -527,18 +529,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || 
defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -700,6 +734,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -719,6 +755,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else @@ -763,6 +801,29 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + +#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE +#ifdef WEAK_IMPORT_ATTRIBUTE +#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE +#elif __has_attribute(__weak__) && __has_attribute(__weak_import__) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) +#elif __has_attribute(__weak__) || \ + (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) +#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) +#else +#define MDBX_WEAK_IMPORT_ATTRIBUTE +#endif +#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -917,6 +978,16 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -980,7 +1051,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1000,7 +1071,7 @@ 
MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1038,8 +1109,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -1047,8 +1118,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) #define __try @@ -1057,36 +1128,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? 
HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -1098,23 +1169,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -1132,7 +1206,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -1142,7 +1216,13 @@ mdbx_syspagesize(void) { #endif } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -1155,7 +1235,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -1163,13 +1243,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -1179,12 +1259,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1195,8 +1275,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -1206,15 +1286,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1223,57 +1303,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC 
int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -1282,25 +1362,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC 
int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -1308,17 +1389,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1328,7 +1410,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1341,24 +1423,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } 
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -1374,7 +1455,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -1395,7 +1476,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -1403,14 +1484,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -1423,14 +1504,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -1445,15 +1526,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -1463,14 +1544,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +#define OSAL_MB2WIDE(FROM, TO) \ + do { \ + const char *const from_tmp = (FROM); \ + const size_t from_mblen = strlen(from_tmp); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ + if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + return ERROR_INVALID_NAME; \ + wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ + if (to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + return ERROR_INVALID_NAME; \ + (TO) = to_tmp; \ + } while (0) + +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -1707,6 +1802,18 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 @@ -1716,11 +1823,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -1979,14 +2086,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -1995,6 +2099,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif @@ -2063,8 +2171,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -2120,15 +2228,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -2163,26 +2271,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -2290,7 +2398,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -2309,11 +2420,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. 
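The atomic_store32()/atomic_load32() hunk above routes these helpers through C11 atomics when MDBX_HAVE_C11ATOMICS is defined, and otherwise falls back to the renamed osal_compiler_barrier()/osal_memory_fence() pair. For readers unfamiliar with the ordering arguments, here is a minimal, self-contained C11 sketch of the release-store / acquire-load pattern those helpers implement. It is illustrative only and not part of the patch; the payload/ready names, the writer/reader split, and the single-call driver are assumptions made for the example.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-ins (assumed names, not from libmdbx). */
    static _Atomic uint32_t payload;
    static _Atomic uint32_t ready;

    static void writer(void) {
      atomic_store_explicit(&payload, 42u, memory_order_relaxed);
      /* Roughly what atomic_store32(p, v, mo_AcquireRelease) does under
       * MDBX_HAVE_C11ATOMICS: a release store, publishing payload first. */
      atomic_store_explicit(&ready, 1u, memory_order_release);
    }

    static void reader(void) {
      /* Roughly what atomic_load32(p, mo_AcquireRelease) does: an acquire
       * load, ordering the flag check before the payload read. */
      if (atomic_load_explicit(&ready, memory_order_acquire))
        printf("payload = %u\n",
               (unsigned)atomic_load_explicit(&payload, memory_order_relaxed));
    }

    int main(void) {
      writer();
      reader();
      return 0;
    }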
@@ -2356,21 +2470,24 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ @@ -2387,6 +2504,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -2406,16 +2531,19 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -2423,17 +2551,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -2550,7 +2678,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -2576,7 +2704,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -2683,6 +2811,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2734,6 +2863,15 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -2745,7 +2883,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -2768,9 +2906,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. 
If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -2780,7 +2918,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2807,6 +2945,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -2831,11 +2970,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; @@ -2876,8 +3015,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -2887,18 +3026,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -2931,13 +3079,15 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -2948,18 +3098,18 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ @@ -3008,7 +3158,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ @@ -3017,11 +3167,11 @@ struct MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -3056,142 +3206,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - 
mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) 
\ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -3216,15 +3362,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -3286,8 +3432,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -3430,7 +3574,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -3455,15 +3600,15 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) @@ -3999,13 +4144,13 @@ int main(int argc, char *argv[]) { MDBX_dbi subdbi; if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_malloc(key.iov_len + 1); + subname = osal_malloc(key.iov_len + 1); memcpy(subname, key.iov_base, key.iov_len); subname[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &subdbi); if (rc == MDBX_SUCCESS) printf("Status of %s\n", subname); - mdbx_free(subname); + osal_free(subname); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_INCOMPATIBLE) continue; diff --git a/crates/libmdbx-rs/src/lib.rs b/crates/libmdbx-rs/src/lib.rs index 21670a75a..80ba3e370 100644 --- a/crates/libmdbx-rs/src/lib.rs +++ b/crates/libmdbx-rs/src/lib.rs @@ -6,8 +6,8 @@ pub use crate::{ cursor::{Cursor, Iter, 
IterDup}, database::Database, environment::{ - Environment, EnvironmentBuilder, EnvironmentKind, Geometry, Info, NoWriteMap, Stat, - WriteMap, + Environment, EnvironmentBuilder, EnvironmentKind, Geometry, Info, NoWriteMap, PageSize, + Stat, WriteMap, }, error::{Error, Result}, flags::*, @@ -42,10 +42,8 @@ mod test_utils { let env = { let mut builder = Environment::new(); builder.set_max_dbs(2); - builder.set_geometry(Geometry { - size: Some(1_000_000..1_000_000), - ..Default::default() - }); + builder + .set_geometry(Geometry { size: Some(1_000_000..1_000_000), ..Default::default() }); builder.open(dir.path()).expect("open mdbx env") }; @@ -53,11 +51,8 @@ mod test_utils { let mut value = [0u8; 8]; LittleEndian::write_u64(&mut value, height); let tx = env.begin_rw_txn().expect("begin_rw_txn"); - let index = tx - .create_db(None, DatabaseFlags::DUP_SORT) - .expect("open index db"); - tx.put(&index, &HEIGHT_KEY, &value, WriteFlags::empty()) - .expect("tx.put"); + let index = tx.create_db(None, DatabaseFlags::DUP_SORT).expect("open index db"); + tx.put(&index, &HEIGHT_KEY, &value, WriteFlags::empty()).expect("tx.put"); tx.commit().expect("tx.commit"); } } diff --git a/crates/libmdbx-rs/tests/cursor.rs b/crates/libmdbx-rs/tests/cursor.rs index f88346546..c2828fd96 100644 --- a/crates/libmdbx-rs/tests/cursor.rs +++ b/crates/libmdbx-rs/tests/cursor.rs @@ -1,8 +1,8 @@ -use libmdbx::*; +use reth_libmdbx::*; use std::borrow::Cow; use tempfile::tempdir; -type Environment = libmdbx::Environment; +type Environment = reth_libmdbx::Environment; #[test] fn test_get() { diff --git a/crates/libmdbx-rs/tests/environment.rs b/crates/libmdbx-rs/tests/environment.rs index 783a29cb5..ca3b4efc7 100644 --- a/crates/libmdbx-rs/tests/environment.rs +++ b/crates/libmdbx-rs/tests/environment.rs @@ -1,8 +1,8 @@ use byteorder::{ByteOrder, LittleEndian}; -use libmdbx::*; +use reth_libmdbx::*; use tempfile::tempdir; -type Environment = libmdbx::Environment; +type Environment = reth_libmdbx::Environment; #[test] fn test_open() { diff --git a/crates/libmdbx-rs/tests/transaction.rs b/crates/libmdbx-rs/tests/transaction.rs index f41903d33..9d59175cb 100644 --- a/crates/libmdbx-rs/tests/transaction.rs +++ b/crates/libmdbx-rs/tests/transaction.rs @@ -1,4 +1,4 @@ -use libmdbx::*; +use reth_libmdbx::*; use std::{ borrow::Cow, io::Write, @@ -7,7 +7,7 @@ use std::{ }; use tempfile::tempdir; -type Environment = libmdbx::Environment; +type Environment = reth_libmdbx::Environment; #[test] fn test_put_get_del() {