diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt index a5167d627..777a3f302 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/CMakeLists.txt @@ -1,5 +1,5 @@ ## -## Copyright 2020-2022 Leonid Yuriev +## Copyright 2020-2023 Leonid Yuriev ## and other libmdbx authors: please see AUTHORS file. ## All rights reserved. ## diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md index 7427de95f..a1b8321df 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/ChangeLog.md @@ -4,7 +4,294 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.2 (Иван Ярыгин) от 2022-11-11 + +## v0.12.6 "ЦСКА" от 2023-04-29 + +Стабилизирующий выпуск с исправлением обнаруженных ошибок и устранением +недочетов, в день 100-летнего юбилея спортивного клуба [«ЦСКА»](https://ru.wikipedia.org/wiki/Центральный_спортивный_клуб_Армии). + +``` +14 files changed, 117 insertions(+), 83 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Мелочи: + + - Обновление патча для старых версий buildroot. + - Использование clang-format-16. + - Использование `enum`-типов вместо `int` для устранения предупреждений GCC 13, + что могло ломать сборку в Fedora 38. + + +-------------------------------------------------------------------------------- + + +## v0.12.5 "Динамо" от 2023-04-18 + +Стабилизирующий выпуск с исправлением обнаруженных ошибок и устранением +недочетов, в день 100-летнего юбилея спортивного общества [«Динамо»](https://ru.wikipedia.org/wiki/Динамо_(спортивное_общество)). 
+ +``` +16 files changed, 686 insertions(+), 247 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Благодарности: + + - Max за сообщение о проблеме экспорта из DSO/DLL + устаревших функций API. + - [`@calvin3721`](https://t.me/calvin3721) за сообщение о проблеме работы + `MainDB` с флагами не по-умолчанию. + +Исправления: + + - Поправлен экспорт из DSO/DLL устаревших функций, + которые заменены на inline в текущем API. + - Устранено использование неверного компаратора при создании или пересоздании + `MainDB` с флагами/опциями предполагающим использование специфического + компаратора (не по-умолчанию). + +Мелочи: + + - Удалена дублирующая диагностика внутри `node_read_bigdata()`. + - Исправлены ссылки в описании `mdbx_env_set_geometry()`. + - Добавлен отдельный тест `extra/upsert_alldups` для специфического + сценария замены/перезаписи одним значением всех multi-значений + соответствующих ключу, т.е. замена всех «дубликатов» одним значением. + - В C++ API добавлены варианты `buffer::key_from()` с явным именованием по типу данных. + - Добавлен отдельный тест `extra/maindb_ordinal` для специфического + сценария создания `MainDB` с флагами требующими использования + компаратора не по-умолчанию. + - Рефакторинг проверки "когерентности" мета-страниц. + - Корректировка `osal_vasprintf()` для устранения предупреждений статических анализаторов. + + +-------------------------------------------------------------------------------- + + +## v0.12.4 "Арта-333" от 2023-03-03 + +Стабилизирующий выпуск с исправлением обнаруженных ошибок, устранением +недочетов и технических долгов. Ветка 0.12 считается готовой к +продуктовому использованию, получает статус стабильной и далее будет +получать только исправление ошибок. Разработка будет продолжена в ветке +0.13, а ветка 0.11 становится архивной. 
+ +``` +63 files changed, 1161 insertions(+), 569 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Благодарности: + + - Max за сообщение о проблеме ERROR_SHARING_VIOLATION + в режиме MDBX_EXCLUSIVE на Windows. + - Alisher Ashyrov за сообщение о проблеме + с assert-проверкой и содействие в отладке. + - Masatoshi Fukunaga за сообщение о проблеме + `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений в subDb. + +Исправления: + + - Устранен регресс после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf, + приводящий к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД + в режиме MDBX_EXCLUSIVE для чтения-записи. + + - Добавлено ограничение размера отображения при коротком read-only файле, для + предотвращения ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая возникает + в этом случае и совсем не информативна для пользователя. + + - Произведен рефакторинг `dxb_resize()`, в том числе, для устранения срабатывания + assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических + многопоточных сценариях использования. Проверка срабатывала только в + отладочных сборках, при специфическом наложении во времени читающей и + пишущей транзакции в разных потоках, одновременно с изменением размера БД. + Кроме срабатывание проверки, каких-либо других последствий не возникало. + + - Устранена проблема в `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены + всех значений единственного ключа в subDb. В ходе этой операции subDb + становится полностью пустой, без каких-либо страниц и именно эта + ситуация не была учтена в коде, что приводило к повреждению БД + при фиксации такой транзакции. + + - Устранена излишняя assert-проверка внутри `override_meta()`. + Что в отладочных сборках могло приводить к ложным срабатываниям + при восстановлении БД, в том числе при автоматическом откате слабых + мета-страниц. 
+ + - Скорректированы макросы `__cold`/`__hot`, в том числе для устранения проблемы + `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` + при сборке посредством GCC >10.x для SH4. + +Ликвидация технических долгов и мелочи: + + - Исправлены многочисленные опечатки в документации. + - Доработан тест для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. + - Расширены сценарии запуска `mdbx_chk` из CMake-тестов для проверки как в обычном, + так и эксклюзивном режимах чтения-записи. + - Уточнены спецификаторы `const` и `noexcept` для нескольких методов в C++ API. + - Устранено использование стека под буферы для `wchar`-преобразования путей. + - Для Windows добавлена функция `mdbx_env_get_path()` для получения пути к БД + в формате многобайтных символов. + - Добавлены doxygen-описания для API с широкими символами. + - Устранены предупреждения статического анализатора MSVC, + все они были несущественные, либо ложные. + - Устранено ложное предупреждение GCC при сборке для SH4. + - Добавлена поддержка ASAN (Address Sanitizer) при сборке посредством MSVC. + - Расширен набор перебираемых режимов в скрипте `test/long_stochastic.sh`, + добавлена опция `--extra`. + - В C++ API добавлена поддержка расширенных опций времени выполнения `mdbx::extra_runtime_option`, + аналогично `enum MDBX_option_t` из C API. + - Вывод всех счетчиков page-operations в `mdbx_stat`. + + +-------------------------------------------------------------------------------- + + +## v0.12.3 "Акула" от 2023-01-07 + +Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source +[проекте "Акула"](https://erigon.substack.com/p/winding-down-support-for-akula-project). + +Добавлена prefault-запись, переделан контроль “некогерентности” unified page/buffer cache, изменена тактика слияния страниц и т.д. +Стало ещё быстрее, в некоторых сценариях вдвое. 
+ +``` +20 files changed, 4508 insertions(+), 2928 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Благодарности: + + - [Alex Sharov](https://t.me/AskAlexSharov) и команде [Erigon](https://github.com/ledgerwatch/erigon) за тестирование. + - [Simon Leier](https://t.me/leisim) за сообщение о сбоях и тестирование. + +Новое: + + - Использование адреса [https://libmdbx.dqdkfa.ru/dead-github](https://libmdbx.dqdkfa.ru/dead-github) + для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. + + - Реализована prefault-запись при выделении страниц для read-write отображений. + Это приводит к кратному снижению системных издержек и существенному увеличению + производительности в соответствующих сценариях использования, когда: + - размер БД и объём данных существенно больше ОЗУ; + - используется режим `MDBX_WRITEMAP`; + - не-мелкие транзакции (по ходу транзакции выделяется многие сотни или тысячи страниц). + + В режиме `MDBX_WRITEMAP` выделение/переиспользование страниц приводит + к page-fault и чтению страницы с диска, даже если содержимое страницы + не нужно (будет перезаписано). Это является следствием работы подсистемы + виртуальной памяти, а штатный способ лечения через `MADV_REMOVE` + работает не на всех ФС и обычно дороже получаемой экономии. + + Теперь в libmdbx используется "упреждающая запись" таких страниц, + которая на системах с [unified page cache](https://www.opennet.ru/base/dev/ubc.txt.html) + приводит к "вталкиванию" данных, устраняя необходимость чтения с диска при + обращении к такой странице памяти. + + Новый функционал работает в согласованности с автоматическим управлением read-ahead + и кэшем статуса присутствия страниц в ОЗУ, посредством [mincore()](https://man7.org/linux/man-pages/man2/mincore.2.html). + + - Добавлена опция `MDBX_opt_prefault_write_enable` для возможности принудительного + включения/выключения prefault-записи. 
+ + - Реализован динамический выбор между сквозной записью на диск и обычной записью + с последующим [fdatasync()](https://man7.org/linux/man-pages/man3/fdatasync.3p.html) + управляемый опцией `MDBX_opt_writethrough_threshold`. + + В долговечных (durable) режимах данные на диск могут быть сброшены двумя способами: + - сквозной записью через файловый дескриптор открытый с `O_DSYNC`; + - обычной записью с последующим вызовом `fdatasync()`. + + Первый способ выгоднее при записи малого количества страниц и/или если + канал взаимодействия с диском/носителем имеет близкую к нулю задержку. + Второй способ выгоднее если требуется записать много страниц и/или канал + взаимодействия имеет весомую задержку (датацентры, облака). Добавленная + опция `MDBX_opt_writethrough_threshold` позволяет во время выполнения + задать порог для динамического выбора способа записи в зависимости от + объема и конкретных условия использования. + + - Автоматическая установка `MDBX_opt_rp_augment_limit` в зависимости от размера БД. + + - Запрещение разного режима `MDBX_WRITEMAP` между процессами в режимах + с отложенной/ленивой записью, так как в этом случае невозможно + обеспечить сброс данных на диск во всех случаях на всех поддерживаемых платформах. + + - Добавлена опция сборки `MDBX_MMAP_USE_MS_ASYNC` позволяющая отключить + использование системного вызова `msync(MS_ASYNC)`, в использовании + которого нет необходимости на подавляющем большинстве актуальных ОС. + По-умолчанию `MDBX_MMAP_USE_MS_ASYNC=0` (выключено) на Linux и других + системах с unified page cache. Такое поведение (без использования + `msync(MS_ASYNC)`) соответствует неизменяемой (hardcoded) логике LMDB. В + результате, в простых/наивных бенчмарках, libmdbx опережает LMDB + примерно также как при реальном применении. 
+ + На всякий случай стоит еще раз отметить/напомнить, что на Windows + предположительно libmdbx будет отставать от LMDB в сценариях с + множеством мелких транзакций, так как libmdbx осознанно использует на + Windows файловые блокировки, которые медленные (плохо реализованы в ядре + ОС), но позволяют застраховать пользователей от массы неверных действий + приводящих к повреждению БД. + + - Поддержка не-печатных имен для subDb. + + - Добавлен явный выбор `tls_model("local-dynamic")` для обхода проблемы + `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` + из-за ошибки в CLANG приводящей к использованию неверного режима `ls_model`. + + - Изменение тактики слияния страниц при удалении. + Теперь слияние выполняется преимущественно с уже измененной/грязной страницей. + Если же справа и слева обе страницы с одинаковым статусом, + то с наименее заполненной, как прежде. В сценариях с массивным удалением + это позволяет увеличить производительность до 50%. + + - Добавлен контроль отсутствия LCK-файлов с альтернативным именованием. + +Исправления (без корректировок новых функций): + + - Изменение размера отображения если это требуется для сброса данных на + диск при вызове `mdbx_env_sync()` из параллельного потока выполнения вне + работающей транзакции. + + - Исправление регресса после коммита db72763de049d6e4546f838277fe83b9081ad1de от 2022-10-08 + в логике возврата грязных страниц в режиме `MDBX_WRITEMAP`, из-за чего + освободившиеся страницы использовались не немедленно, а попадали в + retired-список совершаемой транзакции и происходил необоснованный рост + размера транзакции. + + - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях + повторного открытия среды посредством `mdbx_env_open()`. + + - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, + в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. + Проблема существует только в релизе 0.12.2. 
+ + - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` + актуальной информации об объеме изменений в процессе транзакций чтения-записи. + + - Исправление несущественной опечатки в условиях `#if` определения порядка байт. + + - Исправление сборки для случая `MDBX_PNL_ASCENDING=1`. + +Ликвидация технических долгов и мелочи: + + - Доработка поддержки авто-слияния записей GC внутри `page_alloc_slowpath()`. + - Устранение несущественных предупреждений Coverity. + - Использование единого курсора для поиска в GC. + - Переработка внутренних флагов связанных с выделением страниц из GC. + - Доработка подготовки резерва перед обновлением GC при включенном BigFoot. + - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. + - Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. + - Ускорение работы `mdbx_chk` при обработке пользовательских записей в `@MAIN`. + - Переработка LRU-отметок для спиллинга. + - Переработка контроля "некогерентности" Unified page cache для уменьшения накладных расходов. + - Рефакторинг и микрооптимизация. + + +-------------------------------------------------------------------------------- + + +## v0.12.2 "Иван Ярыгин" от 2022-11-11 Выпуск с существенными доработками и новой функциональностью в память о российском борце [Иване Сергеевиче Ярыгине](https://ru.wikipedia.org/wiki/Ярыгин,_Иван_Сергеевич). @@ -152,7 +439,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) Мелочи: - Исторические ссылки cвязанные с удалённым на ~~github~~ проектом перенаправлены на [web.archive.org](https://web.archive.org/web/https://github.com/erthink/libmdbx). - - Синхронизированны конструкции CMake между проектами. + - Синхронизированы конструкции CMake между проектами. - Добавлено предупреждение о небезопасности RISC-V. - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. 
- Добавлено обходное решение для минимизации ложно-положительных @@ -171,10 +458,10 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Fixed minor MingGW warning. -------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- -## v0.12.1 (Positive Proxima) at 2022-08-24 +## v0.12.1 "Positive Proxima" at 2022-08-24 The planned frontward release with new superior features on the day of 20 anniversary of [Positive Technologies](https://ptsecurty.com). @@ -216,10 +503,54 @@ Fixes: Not a release but preparation for changing feature set and API. -------------------------------------------------------------------------------- +******************************************************************************** -## v0.11.13 at (Swashplate) 2022-11-10 +## v0.11.14 "Sergey Kapitsa" at 2023-02-14 + +The stable bugfix release in memory of [Sergey Kapitsa](https://en.wikipedia.org/wiki/Sergey_Kapitsa) on his 95th birthday. + +``` +22 files changed, 250 insertions(+), 174 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Fixes: + - backport: Fixed insignificant typo of `||` inside `#if` byte-order condition. + - backport: Fixed `SIGSEGV` or an erroneous call to `free()` in situations where + errors occur when reopening by `mdbx_env_open()` of a previously used + environment. + - backport: Fixed `cursor_put_nochecklen()` internals for case when dupsort'ed named subDb + contains a single key with multiple values (aka duplicates), which are replaced + with a single value by put-operation with the `MDBX_UPSERT+MDBX_ALLDUPS` flags. + In this case, the database becomes completely empty, without any pages. + However exactly this condition was not considered and thus wasn't handled correctly. + See [issue#8](https://gitflic.ru/project/erthink/libmdbx/issue/8) for more information. 
+ - backport: Fixed extra assertion inside `override_meta()`, which could + lead to false-positive failing of the assertion in a debug builds during + DB recovery and auto-rollback. + - backport: Refined the `__cold`/`__hot` macros to avoid the + `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` + issue during build using GCC >10.x for SH4 arch. + +Minors: + + - backport: Using the https://libmdbx.dqdkfa.ru/dead-github + for resources deleted by the Github' administration. + - backport: Fixed English typos. + - backport: Fixed proto of `__asan_default_options()`. + - backport: Fixed doxygen-description of C++ API, especially of C++20 concepts. + - backport: Refined `const` and `noexcept` for few C++ API methods. + - backport: Fixed copy&paste typo of "Getting started". + - backport: Update MithrilDB status. + - backport: Resolve false-posirive `used uninitialized` warning from GCC >10.x + while build for SH4 arch. + + +-------------------------------------------------------------------------------- + + +## v0.11.13 at "Swashplate" 2022-11-10 The stable bugfix release in memory of [Boris Yuryev](https://ru.wikipedia.org/wiki/Юрьев,_Борис_Николаевич) on his 133rd birthday. @@ -247,7 +578,10 @@ Minors: - Use `--dont-check-ram-size` for small-tests make-targets (backport). -## v0.11.12 (Эребуни) at 2022-10-12 +-------------------------------------------------------------------------------- + + +## v0.11.12 "Эребуни" at 2022-10-12 The stable bugfix release. @@ -269,7 +603,10 @@ Minors: - Removed needless `LockFileEx()` inside `mdbx_env_copy()` (backport). -## v0.11.11 (Тендра-1790) at 2022-09-11 +-------------------------------------------------------------------------------- + + +## v0.11.11 "Тендра-1790" at 2022-09-11 The stable bugfix release. @@ -285,7 +622,10 @@ Fixes: - Fixed derived C++ builds by removing `MDBX_INTERNAL_FUNC` for `mdbx_w2mb()` and `mdbx_mb2w()`. 
-## v0.11.10 (the TriColor) at 2022-08-22 +-------------------------------------------------------------------------------- + + +## v0.11.10 "the TriColor" at 2022-08-22 The stable bugfix release. @@ -315,8 +655,10 @@ Minors: - Minor clarified `iov_page()` failure case. +-------------------------------------------------------------------------------- -## v0.11.9 (Чирчик-1992) at 2022-08-02 + +## v0.11.9 "Чирчик-1992" at 2022-08-02 The stable bugfix release. @@ -325,7 +667,7 @@ The stable bugfix release. Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` -Acknowledgements: +Acknowledgments: - [Alex Sharov](https://github.com/AskAlexSharov) and Erigon team for reporting and testing. - [Andrew Ashikhmin](https://gitflic.ru/user/yperbasis) for contributing. @@ -354,14 +696,14 @@ Minors: - Fixed `has no symbols` warning from Apple's ranlib. -------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- -## v0.11.8 (Baked Apple) at 2022-06-12 +## v0.11.8 "Baked Apple" at 2022-06-12 The stable release with an important fixes and workaround for the critical macOS thread-local-storage issue. -Acknowledgements: +Acknowledgments: - [Masatoshi Fukunaga](https://github.com/mah0x211) for [Lua bindings](https://github.com/mah0x211/lua-libmdbx). @@ -407,10 +749,10 @@ Minors: - Don't provide nor report package information if used as a CMake subproject. -------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- -## v0.11.7 (Resurrected Sarmat) at 2022-04-22 +## v0.11.7 "Resurrected Sarmat" at 2022-04-22 The stable risen release after the Github's intentional malicious disaster. @@ -456,7 +798,7 @@ Minors: - Switched to using `MDBX_EPERM` instead of `MDBX_RESULT_TRUE` to indicate that the geometry cannot be updated. - Added `NULL` checking during memory allocation inside `mdbx_chk`. 
- Resolved all warnings from MinGW while used without CMake. - - Added inheretable `target_include_directories()` to `CMakeLists.txt` for easy integration. + - Added inheritable `target_include_directories()` to `CMakeLists.txt` for easy integration. - Added build-time checks and paranoid runtime assertions for the `off_t` arguments of `fcntl()` which are used for locking. - Added `-Wno-lto-type-mismatch` to avoid false-positive warnings from old GCC during LTO-enabled builds. - Added checking for TID (system thread id) to avoid hang on 32-bit Bionic/Android within `pthread_mutex_lock()`. @@ -464,24 +806,24 @@ Minors: - Added `CMAKE_HOST_ARCH` and `CMAKE_HOST_CAN_RUN_EXECUTABLES_BUILT_FOR_TARGET`. -------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- ## v0.11.6 at 2022-03-24 The stable release with the complete workaround for an incoherence flaw of Linux unified page/buffer cache. Nonetheless the cause for this trouble may be an issue of Intel CPU cache/MESI. -See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for more information. -Acknowledgements: +Acknowledgments: - [David Bouyssié](https://github.com/david-bouyssie) for [Scala bindings](https://github.com/david-bouyssie/mdbx4s). - [Michelangelo Riccobene](https://github.com/mriccobene) for reporting and testing. Fixes: - - [Added complete workaround](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions. + - [Added complete workaround](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for an incoherence flaw of Linux unified page/buffer cache. 
+ - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/272) cursor reusing for read-only transactions. - Fixed copy&paste typo inside `mdbx::cursor::find_multivalue()`. Minors: @@ -493,12 +835,15 @@ Minors: - Clarified error messages of a signature/version mismatch. +-------------------------------------------------------------------------------- + + ## v0.11.5 at 2022-02-23 The release with the temporary hotfix for a flaw of Linux unified page/buffer cache. -See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for more information. -Acknowledgements: +Acknowledgments: - [Simon Leier](https://github.com/leisim) for reporting and testing. - [Kai Wetlesen](https://github.com/kaiwetlesen) for [RPMs](http://copr.fedorainfracloud.org/coprs/kwetlesen/libmdbx/). @@ -506,10 +851,10 @@ Acknowledgements: Fixes: - - [Added hotfix](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache. - - [Fixed/Reworked](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API. + - [Added hotfix](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for a flaw of Linux unified page/buffer cache. + - [Fixed/Reworked](https://libmdbx.dqdkfa.ru/dead-github/pull/270) move-assignment operators for "managed" classes of C++ API. - Fixed potential `SIGSEGV` while open DB with overrided non-default page size. - - [Made](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases. + - [Made](https://libmdbx.dqdkfa.ru/dead-github/issues/267) `mdbx_env_open()` idempotence in failure cases. - Refined/Fixed pages reservation inside `mdbx_update_gc()` to avoid non-reclamation in a rare cases. - Fixed typo in a retained space calculation for the hsr-callback. 
@@ -522,11 +867,14 @@ Minors: - Minor fixes Doxygen references, comments, descriptions, etc. +-------------------------------------------------------------------------------- + + ## v0.11.4 at 2022-02-02 The stable release with fixes for large and huge databases sized of 4..128 TiB. -Acknowledgements: +Acknowledgments: - [Ledgerwatch](https://github.com/ledgerwatch), [Binance](https://github.com/binance-chain) and [Positive Technologies](https://www.ptsecurity.com/) teams for reporting, assistance in investigation and testing. - [Alex Sharov](https://github.com/AskAlexSharov) for reporting, testing and provide resources for remote debugging/investigation. @@ -542,15 +890,15 @@ New features, extensions and improvements: Fixes: - Fixed handling `MDBX_opt_rp_augment_limit` for GC's records from huge transactions (Erigon/Akula/Ethereum). - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`). - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/258) build on Android (avoid including `sys/sem.h`). + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/pull/261) missing copy assignment operator for `mdbx::move_result`. - Fixed missing `&` for `std::ostream &operator<<()` overloads. - Fixed unexpected `EXDEV` (Cross-device link) error from `mdbx_env_copy()`. - Fixed base64 encoding/decoding bugs in auxillary C++ API. - Fixed overflow of `pgno_t` during checking PNL on 64-bit platforms. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/260) excessive PNL checking after sort for spilling. - Reworked checking `MAX_PAGENO` and DB upper-size geometry limit. 
- - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/265) build for some combinations of versions of MSVC and Windows SDK. Minors: @@ -567,9 +915,12 @@ Minors: - Using the `-fno-semantic interposition` option to reduce the overhead to calling self own public functions. +-------------------------------------------------------------------------------- + + ## v0.11.3 at 2021-12-31 -Acknowledgements: +Acknowledgments: - [gcxfd ](https://github.com/gcxfd) for reporting, contributing and testing. - [장세연 (Чан Се Ен)](https://github.com/sasgas) for reporting and testing. @@ -577,10 +928,10 @@ Acknowledgements: New features, extensions and improvements: - - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`. - - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`. + - [Added](https://libmdbx.dqdkfa.ru/dead-github/issues/236) `mdbx_cursor_get_batch()`. + - [Added](https://libmdbx.dqdkfa.ru/dead-github/issues/250) `MDBX_SET_UPPERBOUND`. - C++ API is finalized now. - - The GC update stage has been [significantly speeded](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). + - The GC update stage has been [significantly speeded](https://libmdbx.dqdkfa.ru/dead-github/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). Fixes: @@ -591,18 +942,21 @@ Minors: - Fixed returning `MDBX_RESULT_TRUE` (unexpected -1) from `mdbx_env_set_option()`. - Added `mdbx_env_get_syncbytes()` and `mdbx_env_get_syncperiod()`. - - [Clarified](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`. 
+ - [Clarified](https://libmdbx.dqdkfa.ru/dead-github/pull/249) description of `MDBX_INTEGERKEY`. - Reworked/simplified `mdbx_env_sync_internal()`. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. - Avoiding extra looping inside `mdbx_env_info_ex()`. - Explicitly enabled core dumps from stochastic tests scripts on Linux. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. - For compatibility reverted returning `MDBX_ENODATA`for some cases. +-------------------------------------------------------------------------------- + + ## v0.11.2 at 2021-12-02 -Acknowledgements: +Acknowledgments: - [장세연 (Чан Се Ен)](https://github.com/sasgas) for contributing to C++ API. - [Alain Picard](https://github.com/castortech) for [Java bindings](https://github.com/castortech/mdbxjni). @@ -612,10 +966,10 @@ Acknowledgements: Fixes: - - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7. - - [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page. - - [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. - - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers. + - [Fixed compilation](https://libmdbx.dqdkfa.ru/dead-github/pull/239) with `devtoolset-9` on CentOS/RHEL 7. 
+ - [Fixed unexpected `MDBX_PROBLEM` error](https://libmdbx.dqdkfa.ru/dead-github/issues/242) because of update an obsolete meta-page. + - [Fixed returning `MDBX_NOTFOUND` error](https://libmdbx.dqdkfa.ru/dead-github/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. + - [Fixed compilation](https://libmdbx.dqdkfa.ru/dead-github/issues/245) without kernel/libc-devel headers. Minors: @@ -626,13 +980,16 @@ Minors: - Remove unneeded `#undef P_DIRTY`. +-------------------------------------------------------------------------------- + + ## v0.11.1 at 2021-10-23 ### Backward compatibility break: The database format signature has been changed to prevent forward-interoperability with an previous releases, which may lead to a -[false positive diagnosis of database corruption](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/238) +[false positive diagnosis of database corruption](https://libmdbx.dqdkfa.ru/dead-github/issues/238) due to flaws of an old library versions. This change is mostly invisible: @@ -640,12 +997,12 @@ This change is mostly invisible: - previously versions are unable to read/write a new DBs; - but the new release is able to handle an old DBs and will silently upgrade ones. -Acknowledgements: +Acknowledgments: - [Alex Sharov](https://github.com/AskAlexSharov) for reporting and testing. -------------------------------------------------------------------------------- +******************************************************************************** ## v0.10.5 at 2021-10-13 (obsolete, please use v0.11.1) @@ -658,7 +1015,7 @@ Unfortunately, the `v0.10.5` accidentally comes not full-compatible with previou This cannot be fixed, as it requires fixing past versions, which as a result we will just get a current version. Therefore, it is recommended to use `v0.11.1` instead of `v0.10.5`. -Acknowledgements: +Acknowledgments: - [Noel Kuntze](https://github.com/Thermi) for immediately bug reporting. 
@@ -674,9 +1031,12 @@ Minors: - Refined providing information for the `@MAIN` and `@GC` sub-databases of a last committed modification transaction's ID. +-------------------------------------------------------------------------------- + + ## v0.10.4 at 2021-10-10 -Acknowledgements: +Acknowledgments: - [Artem Vorotnikov](https://github.com/vorot93) for support [Rust wrapper](https://github.com/vorot93/libmdbx-rs). - [Andrew Ashikhmin](https://github.com/yperbasis) for contributing to C++ API. @@ -684,7 +1044,7 @@ Acknowledgements: Fixes: - Fixed possibility of looping update GC during transaction commit (no public issue since the problem was discovered inside [Positive Technologies](https://www.ptsecurity.ru)). - - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/235). + - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://libmdbx.dqdkfa.ru/dead-github/issues/235). - Fixed `noexcept` for potentially throwing `txn::put()` of C++ API. Minors: @@ -694,9 +1054,12 @@ Minors: - In debugging builds fixed a too small (single page) by default DB shrink threshold. +-------------------------------------------------------------------------------- + + ## v0.10.3 at 2021-08-27 -Acknowledgements: +Acknowledgments: - [Francisco Vallarino](https://github.com/fjvallarino) for [Haskell bindings for libmdbx](https://hackage.haskell.org/package/libmdbx). - [Alex Sharov](https://github.com/AskAlexSharov) for reporting and testing. @@ -710,7 +1073,7 @@ Extensions and improvements: Fixes: - - Always setup `madvise` while opening DB (fixes https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/231). + - Always setup `madvise` while opening DB (fixes https://libmdbx.dqdkfa.ru/dead-github/issues/231). - Fixed checking legacy `P_DIRTY` flag (`0x10`) for nested/sub-pages. 
Minors: @@ -722,20 +1085,23 @@ Minors: - Fixed CMake warning about compatibility with 3.8.2 +-------------------------------------------------------------------------------- + + ## v0.10.2 at 2021-07-26 -Acknowledgements: +Acknowledgments: - [Alex Sharov](https://github.com/AskAlexSharov) for reporting and testing. - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for reporting bugs. - [Lionel Debroux](https://github.com/debrouxl) for fuzzing tests and reporting bugs. - [Sergey Fedotov](https://github.com/SergeyFromHell/) for [`node-mdbx` NodeJS bindings](https://www.npmjs.com/package/node-mdbx). - [Kris Zyp](https://github.com/kriszyp) for [`lmdbx-store` NodeJS bindings](https://github.com/kriszyp/lmdbx-store). - - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings). + - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://libmdbx.dqdkfa.ru/dead-github/commits/python-bindings). New features, extensions and improvements: - - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/201). + - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://libmdbx.dqdkfa.ru/dead-github/issues/201). - Added options support for `long-stochastic` script. - Avoided `MDBX_TXN_FULL` error for large transactions when possible. - The `MDBX_READERS_LIMIT` increased to `32767`. @@ -743,7 +1109,7 @@ New features, extensions and improvements: - Minimized the size of poisoned/unpoisoned regions to avoid Valgrind/ASAN stuck. - Added more workarounds for QEMU for testing builds for 32-bit platforms, Alpha and Sparc architectures. - `mdbx_chk` now skips iteration & checking of DB' records if corresponding page-tree is corrupted (to avoid `SIGSEGV`, ASAN failures, etc). 
- - Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). + - Added more checks for [rare/fuzzing corruption cases](https://libmdbx.dqdkfa.ru/dead-github/issues/217). Backward compatibility break: @@ -755,23 +1121,26 @@ Backward compatibility break: Fixes: - Fixed excess meta-pages checks in case `mdbx_chk` is called to check the DB for a specific meta page and thus could prevent switching to the selected meta page, even if the check passed without errors. - - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/203). - - Fixed [log a warning during a new DB creation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/205). - - Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/207). - - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/208). - - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/209). - - Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/191). + - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://libmdbx.dqdkfa.ru/dead-github/issues/203). + - Fixed [log a warning during a new DB creation](https://libmdbx.dqdkfa.ru/dead-github/issues/205). + - Fixed [false-negative `mdbx_cursor_eof()` result](https://libmdbx.dqdkfa.ru/dead-github/issues/207). + - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://libmdbx.dqdkfa.ru/dead-github/issues/208). 
+ - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://libmdbx.dqdkfa.ru/dead-github/issues/209). + - Fixed [C++ Buffer issue with `std::string` and alignment](https://libmdbx.dqdkfa.ru/dead-github/issues/191). - Fixed `safe64_reset()` for platforms without atomic 64-bit compare-and-swap. - Fixed hang/shutdown on big-endian platforms without `__cxa_thread_atexit()`. - - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). + - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://libmdbx.dqdkfa.ru/dead-github/issues/217). - Fixed extra `noexcept` for `buffer::&assign_reference()`. - Fixed `bootid` generation on Windows for case of change system' time. - - Fixed [test framework keygen-related issue](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/127). + - Fixed [test framework keygen-related issue](https://libmdbx.dqdkfa.ru/dead-github/issues/127). + + +-------------------------------------------------------------------------------- ## v0.10.1 at 2021-06-01 -Acknowledgements: +Acknowledgments: - [Alexey Akhunov](https://github.com/AlexeyAkhunov) and [Alex Sharov](https://github.com/AskAlexSharov) for bug reporting and testing. - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for bug reporting and testing related to WSL2. @@ -787,15 +1156,18 @@ New features: Fixes: - Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library. - - Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/197). + - Fixed confusing/messy errors when build library from unfit github's archives (https://libmdbx.dqdkfa.ru/dead-github/issues/197). - Fixed `#​e​l​s​i​f` typo. 
- - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/195). - - Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/97). + - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://libmdbx.dqdkfa.ru/dead-github/issues/195). + - Re-Fixed WSL1/WSL2 detection with distinguishing (https://libmdbx.dqdkfa.ru/dead-github/issues/97). + + +-------------------------------------------------------------------------------- ## v0.10.0 at 2021-05-09 -Acknowledgements: +Acknowledgments: - [Mahlon E. Smith](https://github.com/mahlonsmith) for [Ruby bindings](https://rubygems.org/gems/mdbx/). - [Alex Sharov](https://github.com/AskAlexSharov) for [mdbx-go](https://github.com/torquem-ch/mdbx-go), bug reporting and testing. @@ -813,7 +1185,7 @@ New features: and conjointly with the `MDBX_ENV_CHECKPID=0` and `MDBX_TXN_CHECKOWNER=0` options can yield up to 30% more performance compared to LMDB. - Using float point (exponential quantized) representation for internal 16-bit values - of grow step and shrink threshold when huge ones (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/166). + of grow step and shrink threshold when huge ones (https://libmdbx.dqdkfa.ru/dead-github/issues/166). To minimize the impact on compatibility, only the odd values inside the upper half of the range (i.e. 32769..65533) are used for the new representation. - Added the `mdbx_drop` similar to LMDB command-line tool to purge or delete (sub)database(s). @@ -822,7 +1194,7 @@ New features: - The internal node sizes were refined, resulting in a reduction in large/overflow pages in some use cases and a slight increase in limits for a keys size to ≈½ of page size. 
- Added to `mdbx_chk` output number of keys/items on pages. - - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/180). + - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://libmdbx.dqdkfa.ru/dead-github/pull/180). - Major rework page splitting (af9b7b560505684249b76730997f9e00614b8113) for - An "auto-appending" feature upon insertion for both ascending and descending key sequences. As a result, the optimality of page filling @@ -830,7 +1202,7 @@ New features: inserting ordered sequences of keys, - A "splitting at middle" to make page tree more balanced on average. - Added `mdbx_get_sysraminfo()` to the API. - - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). + - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://libmdbx.dqdkfa.ru/dead-github/issues/183). - Major rework internal labeling of a dirty pages (958fd5b9479f52f2124ab7e83c6b18b04b0e7dda) for a "transparent spilling" feature with the gist to make a dirty pages be ready to spilling (writing to a disk) without further altering ones. @@ -846,7 +1218,7 @@ New features: - Support `make help` to list available make targets. - Silently `make`'s build by default. - Preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings) is available now - by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/147). + by [Noel Kuntze](https://github.com/Thermi) (https://libmdbx.dqdkfa.ru/dead-github/issues/147). Backward compatibility break: @@ -861,30 +1233,30 @@ Backward compatibility break: Fixes: - - Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/160). 
- - Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/168). + - Fixed performance regression due non-optimal C11 atomics usage (https://libmdbx.dqdkfa.ru/dead-github/issues/160). + - Fixed "reincarnation" of subDB after it deletion (https://libmdbx.dqdkfa.ru/dead-github/issues/168). - Fixed (disallowing) implicit subDB deletion via operations on `@MAIN`'s DBI-handle. - - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171). - - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/170). - - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/179). - - Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). + - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://libmdbx.dqdkfa.ru/dead-github/issues/171). + - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://libmdbx.dqdkfa.ru/dead-github/issues/170). + - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://libmdbx.dqdkfa.ru/dead-github/issues/179). + - Fixed an unreasonably huge default upper limit for DB geometry (https://libmdbx.dqdkfa.ru/dead-github/issues/183). - Fixed `constexpr` specifier for the `slice::invalid()`. - - Fixed (no)readahead auto-handling (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/164). + - Fixed (no)readahead auto-handling (https://libmdbx.dqdkfa.ru/dead-github/issues/164). - Fixed non-alloy build for Windows. 
- Switched to using Heap-functions instead of LocalAlloc/LocalFree on Windows. - - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/190). + - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://libmdbx.dqdkfa.ru/dead-github/issues/190). - Fixed building by GCC 4.8.5 (added workaround for a preprocessor's bug). - Fixed building C++ part for iOS <= 13.0 (unavailability of `std::filesystem::path`). - Fixed building for Windows target versions prior to Windows Vista (`WIN32_WINNT < 0x0600`). - - Fixed building by MinGW for Windows (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/155). + - Fixed building by MinGW for Windows (https://libmdbx.dqdkfa.ru/dead-github/issues/155). -------------------------------------------------------------------------------- +******************************************************************************** ## v0.9.3 at 2021-02-02 -Acknowledgements: +Acknowledgments: - [Mahlon E. Smith](http://www.martini.nu/) for [FreeBSD port of libmdbx](https://svnweb.freebsd.org/ports/head/databases/mdbx/). - [장세연](http://www.castis.com) for bug fixing and PR. @@ -899,7 +1271,7 @@ Removed options and features: New features: - Package for FreeBSD is available now by Mahlon E. Smith. 
- - New API functions to get/set various options (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/128): + - New API functions to get/set various options (https://libmdbx.dqdkfa.ru/dead-github/issues/128): - the maximum number of named databases for the environment; - the maximum number of threads/reader slots; - threshold (since the last unsteady commit) to force flush the data buffers to disk; @@ -912,7 +1284,7 @@ New features: - maximal part of the dirty pages may be spilled when necessary; - minimal part of the dirty pages should be spilled when necessary; - how much of the parent transaction dirty pages will be spilled while start each child transaction; - - Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). + - Unlimited/Dynamic size of retired and dirty page lists (https://libmdbx.dqdkfa.ru/dead-github/issues/123). - Added `-p` option (purge subDB before loading) to `mdbx_load` tool. - Reworked spilling of large transaction and committing of nested transactions: - page spilling code reworked to avoid the flaws and bugs inherited from LMDB; @@ -922,27 +1294,30 @@ New features: - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options. - Added `mdbx_default_pagesize()` function. - Better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means [C11 atomics](https://en.cppreference.com/w/c/atomic). - - Speed up page number lists and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/132). + - Speed up page number lists and dirty page lists (https://libmdbx.dqdkfa.ru/dead-github/issues/132). - Added `LIBMDBX_NO_EXPORTS_LEGACY_API` build option. Fixes: - - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/143). 
+ - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://libmdbx.dqdkfa.ru/dead-github/pull/143). - Fixed `mdbx_realloc()` for case of nullptr and `MDBX_WITHOUT_MSVC_CRT=ON` for Windows. - - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/146). - - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). + - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://libmdbx.dqdkfa.ru/dead-github/issues/146). + - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://libmdbx.dqdkfa.ru/dead-github/issues/153). - Fixed minor/potential memory leak during page flushing and unspilling. - Fixed handling states of cursors's and subDBs's for nested transactions. - Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit. - - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). - - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). + - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://libmdbx.dqdkfa.ru/dead-github/issues/153). + - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://libmdbx.dqdkfa.ru/dead-github/issues/123). - Fixed auto-recovery (`weak->steady` with the same boot-id) when Database size at last weak checkpoint is large than at last steady checkpoint. 
- - Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/157). + - Fixed operation on systems with unusual small/large page size, including PowerPC (https://libmdbx.dqdkfa.ru/dead-github/issues/157). + + +-------------------------------------------------------------------------------- ## v0.9.2 at 2020-11-27 -Acknowledgements: +Acknowledgments: - Jens Alfke (Mobile Architect at [Couchbase](https://www.couchbase.com/)) for [NimDBX](https://github.com/snej/nimdbx). - Clément Renault (CTO at [MeiliSearch](https://www.meilisearch.com/)) for [mdbx-rs](https://github.com/Kerollmops/mdbx-rs). @@ -975,11 +1350,11 @@ Fixes: - Fixed copy&paste typos. - Fixed minor false-positive GCC warning. - Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK. - - Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/121). + - Fixed cursor state after multimap/dupsort repeated deletes (https://libmdbx.dqdkfa.ru/dead-github/issues/121). - Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`. - - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/131). - - Fixed spilled pages checking (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/126). - - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/136). + - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://libmdbx.dqdkfa.ru/dead-github/issues/131). + - Fixed spilled pages checking (https://libmdbx.dqdkfa.ru/dead-github/issues/126). + - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://libmdbx.dqdkfa.ru/dead-github/issues/136). - Fixed save/restore/commit of cursors for nested transactions. 
- Fixed cursors state in rare/special cases (move next beyond end-of-data, after deletion and so on). - Added workaround for MSVC 19.28 (Visual Studio 16.8) (but may still hang during compilation). @@ -990,6 +1365,9 @@ Fixes: - Added handling `EXCEPTION_POSSIBLE_DEADLOCK` condition for Windows. +-------------------------------------------------------------------------------- + + ## v0.9.1 2020-09-30 Added features: @@ -1032,10 +1410,13 @@ Fixes: - Fix a lot of typos & spelling (Thanks to Josh Soref for PR). - Fix `getopt()` messages for Windows (Thanks to Andrey Sporaw for reporting). - Fix MSVC compiler version requirements (Thanks to Andrey Sporaw for reporting). - - Workarounds for QEMU's bugs to run tests for cross-builded library under QEMU. + - Workarounds for QEMU's bugs to run tests for cross-built library under QEMU. - Now C++ compiler optional for building by CMake. +-------------------------------------------------------------------------------- + + ## v0.9.0 2020-07-31 (not a release, but API changes) Added features: @@ -1049,7 +1430,7 @@ Deprecated functions and flags: Please use the value-to-key functions to provide keys that are compatible with the built-in libmdbx comparators. -------------------------------------------------------------------------------- +******************************************************************************** ## 2020-07-06 @@ -1101,7 +1482,7 @@ Deprecated functions and flags: - Avoid using `pwritev()` for single-writes (up to 10% speedup for some kernels & scenarios). - Avoiding `MDBX_UTTERLY_NOSYNC` as result of flags merge. - Add `mdbx_dbi_dupsort_depthmask()` function. - - Add `MDBX_CP_FORCE_RESIZEABLE` option. + - Add `MDBX_CP_FORCE_RESIZABLE` option. - Add deprecated `MDBX_MAP_RESIZED` for compatibility. - Add `MDBX_BUILD_TOOLS` option (default `ON`). - Refine `mdbx_dbi_open_ex()` to safe concurrently opening the same handle from different threads. 
@@ -1189,6 +1570,8 @@ Deprecated functions and flags: - API description. - Checking for non-local filesystems to avoid DB corruption. -------------------------------------------------------------------------------- + +******************************************************************************** + For early changes see the git commit history. diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile index f9c8ae29f..35e7849e8 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/GNUmakefile @@ -320,7 +320,7 @@ IOARENA := $(shell \ (test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \ (test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \ (test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena 2>&- || \ - (echo false && echo '$(TIP) Clone and build the https://github.com/pmwkaa/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) + (echo false && echo '$(TIP) Clone and build the https://abf.io/erthink/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) endif NN ?= 25000000 BENCH_CRUD_MODE ?= nosync @@ -334,7 +334,7 @@ re-bench: bench-clean bench ifeq ($(or $(IOARENA),false),false) bench bench-quartet bench-triplet bench-couple: $(QUIET)echo 'The `ioarena` benchmark is required.' >&2 && \ - echo 'Please clone and build the https://github.com/pmwkaa/ioarena.git within a neighbouring `ioarena` directory.' >&2 && \ + echo 'Please clone and build the https://abf.io/erthink/ioarena.git within a neighbouring `ioarena` directory.' >&2 && \ false else @@ -345,15 +345,20 @@ define bench-rule bench-$(1)_$(2).txt: $(3) $(IOARENA) $(lastword $(MAKEFILE_LIST)) @echo ' RUNNING ioarena for $1/$2...' 
$(QUIET)(export LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}"; \ - ldd $(IOARENA) && \ + ldd $(IOARENA) | grep -i $(1) && \ + $(IOARENA) -D $(1) -B batch -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee $$@ | grep throughput | sed 's/throughput/batch×N/' && \ $(IOARENA) -D $(1) -B crud -m $(BENCH_CRUD_MODE) -n $(2) \ - | tee $$@ | grep throughput && \ + | tee -a $$@ | grep throughput | sed 's/throughput/ crud/' && \ $(IOARENA) -D $(1) -B iterate,get,iterate,get,iterate -m $(BENCH_CRUD_MODE) -r 4 -n $(2) \ - | tee -a $$@ | grep throughput \ - ) || mv -f $$@ $$@.error + | tee -a $$@ | grep throughput | sed '0,/throughput/{s/throughput/iterate/};s/throughput/ get/' && \ + $(IOARENA) -D $(1) -B delete -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee -a $$@ | grep throughput | sed 's/throughput/ delete/' && \ + true) || mv -f $$@ $$@.error endef + $(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) $(eval $(call bench-rule,sophia,$(NN))) diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/README.md b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/README.md index d25189b83..46e1c5492 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/README.md +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/README.md @@ -81,19 +81,48 @@ Historically, _libmdbx_ is a deeply revised and extended descendant of the amazi [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). _libmdbx_ inherits all benefits from _LMDB_, but resolves some issues and adds [a set of improvements](#improvements-beyond-lmdb). +### MithrilDB and Future + -The next version is under active non-public development from scratch and will be +The next version is under non-public development from scratch and will be released as **MithrilDB** and `libmithrildb` for libraries & packages. Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is resembling silver but being stronger and lighter than steel. Therefore _MithrilDB_ is a rightly relevant name. 
- > _MithrilDB_ will be radically different from _libmdbx_ by the new - > database format and API based on C++17, as well as the [Apache 2.0 - > License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this - > revolution is to provide a clearer and robust API, add more features and - > new valuable properties of the database. +_MithrilDB_ is radically different from _libmdbx_ by the new database +format and API based on C++20. The goal of this revolution is to provide +a clearer and robust API, add more features and new valuable properties +of the database. All fundamental architectural problems of libmdbx/LMDB +have been solved there, but now the active development has been +suspended for top-three reasons: + +1. For now _libmdbx_ «mostly» enough for all [our products](https://www.ptsecurity.com/ww-en/products/), +and I’m busy in development of replication for scalability. +2. Waiting for fresh [Elbrus CPU](https://wiki.elbrus.ru/) of [e2k architecture](https://en.wikipedia.org/wiki/Elbrus_2000), +especially with hardware acceleration of [Streebog](https://en.wikipedia.org/wiki/Streebog) and +[Kuznyechik](https://en.wikipedia.org/wiki/Kuznyechik), which are required for Merkle tree, etc. +3. The expectation of needs and opportunities due to the wide use of NVDIMM (aka persistent memory), +modern NVMe and [Ангара](https://ru.wikipedia.org/wiki/Ангара_(интерконнект)). + +However, _MithrilDB_ will not be available for countries unfriendly to +Russia (i.e. acceded the sanctions, devil adepts and/or NATO). But it is +not yet known whether such restriction will be implemented only through +a license and support, either the source code will not be open at all. +Basically we are not inclined to allow our work to contribute to the +profit that goes to weapons that kill our relatives and friends. +NO OPTIONS. + +Nonetheless, I try not to make any promises regarding _MithrilDB_ until release. + +Contrary to _MithrilDB_, _libmdbx_ will forever free and open source. 
+Moreover with high-quality support whenever possible. Tu deviens +responsable pour toujours de ce que tu as apprivoisé. So we will continue +to comply with the original open license and the principles of +constructive cooperation, in spite of outright Github sabotage and +sanctions. I will also try to keep (not drop) Windows support, despite +it is an unused obsolete technology for us. @@ -248,7 +277,7 @@ the user's point of view. > and up to 30% faster when _libmdbx_ compiled with specific build options > which downgrades several runtime checks to be match with LMDB behaviour. > - > These and other results could be easily reproduced with [ioArena](https://github.com/pmwkaa/ioarena) just by `make bench-quartet` command, + > These and other results could be easily reproduced with [ioArena](https://abf.io/erthink/ioarena.git) just by `make bench-quartet` command, > including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB) > and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger). @@ -435,7 +464,7 @@ unexpected or broken down. ### Testing The amalgamated source code does not contain any tests for or several reasons. -Please read [the explanation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this. +Please read [the explanation](https://libmdbx.dqdkfa.ru/dead-github/issues/214#issuecomment-870717981) and don't ask to alter this. So for testing _libmdbx_ itself you need a full source code, i.e. the clone of a git repository, there is no option. The full source code of _libmdbx_ has a [`test` subdirectory](https://gitflic.ru/project/erthink/libmdbx/tree/master/test) with minimalistic test "framework". 
@@ -618,7 +647,7 @@ Bindings | Rust | [libmdbx-rs](https://github.com/vorot93/libmdbx-rs) | [Artem Vorotnikov](https://github.com/vorot93) | | Rust | [mdbx](https://crates.io/crates/mdbx) | [gcxfd](https://github.com/gcxfd) | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | -| Python (draft) | [python-bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) +| Python (draft) | [python-bindings](https://libmdbx.dqdkfa.ru/dead-github/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) | .NET (obsolete) | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | @@ -630,7 +659,7 @@ Bindings Performance comparison ====================== -All benchmarks were done in 2015 by [IOArena](https://github.com/pmwkaa/ioarena) +All benchmarks were done in 2015 by [IOArena](https://abf.io/erthink/ioarena.git) and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015) runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz (2 physical cores, 4 HyperThreading cores), 8 Gb RAM, SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb. diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt index 26ff4d6cf..cbc73cc52 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/VERSION.txt @@ -1 +1 @@ -0.12.2.0 +0.12.6.0 diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/compiler.cmake b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/compiler.cmake index 78a31946f..1d805ea04 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/compiler.cmake +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/compiler.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . 
## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. @@ -348,6 +348,8 @@ endif() if(MSVC) check_compiler_flag("/WX" CC_HAS_WERROR) + check_compiler_flag("/fsanitize=address" CC_HAS_ASAN) + check_compiler_flag("/fsanitize=undefined" CC_HAS_UBSAN) else() # # GCC started to warn for unused result starting from 4.2, and @@ -839,19 +841,26 @@ macro(setup_compile_flags) endif() if(ENABLE_ASAN) - add_compile_flags("C;CXX" "-fsanitize=address") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=address") + else() + add_compile_flags("C;CXX" "/fsanitize=address") + endif() add_definitions(-DASAN_ENABLED=1) endif() if(ENABLE_UBSAN) - add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + else() + add_compile_flags("C;CXX" "/fsanitize=undefined") + endif() add_definitions(-DUBSAN_ENABLED=1) endif() if(ENABLE_GCOV) if(NOT HAVE_GCOV) - message(FATAL_ERROR - "ENABLE_GCOV option requested but gcov library is not found") + message(FATAL_ERROR "ENABLE_GCOV option requested but gcov library is not found") endif() add_compile_flags("C;CXX" "-fprofile-arcs" "-ftest-coverage") diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/profile.cmake b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/profile.cmake index c9b8bed4d..f13b6976e 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/profile.cmake +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/profile.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. 
diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/utils.cmake b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/utils.cmake index 6a3315e1e..aa8aef01f 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/utils.cmake +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/cmake/utils.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 index e0587e993..7c3b688b4 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_CHK 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 index 49e2b4d41..c8dce2988 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_copy.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . -.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_COPY 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_COPY 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 index ec01905b2..14924a76d 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ -.\" Copyright 2021-2022 Leonid Yuriev . +.\" Copyright 2021-2023 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_DROP 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 index 5e173903d..50c799e55 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_dump.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . -.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_DUMP 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 index 44dbe7d70..668fdbdb9 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_load.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . 
-.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_LOAD 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 index 3bc3664ab..d3f19f793 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/man1/mdbx_stat.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . -.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_STAT 1 "2023-04-29" "MDBX 0.12.6" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c index 1b73e82d5..e97392598 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -87,27 +87,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) 
/* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -117,8 +121,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -135,7 +139,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -428,8 +432,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -707,17 +711,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot 
__attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -728,17 +728,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot __attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -804,6 +800,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + 
/*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -975,7 +993,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1180,7 +1198,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is @@ -1193,14 +1212,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1213,8 +1233,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1282,13 +1306,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; 
volatile LONG async_waiting; @@ -1305,7 +1328,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1314,11 +1336,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1329,7 +1353,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1347,11 +1371,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1491,9 +1517,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1515,6 +1542,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1528,9 +1568,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1552,6 +1591,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1708,22 +1748,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1855,6 +1880,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1930,6 +1995,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1939,6 +2006,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1952,6 +2021,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1977,6 +2048,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2091,7 +2175,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2177,6 +2265,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2190,6 +2280,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2199,6 +2291,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2210,6 +2304,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2221,6 +2317,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2229,6 +2328,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2241,8 +2343,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2250,6 +2365,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2275,6 +2392,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2364,6 +2483,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2662,16 +2917,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2713,18 +2964,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2754,6 +3011,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2893,6 +3158,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2942,6 +3211,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3014,7 +3289,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3039,14 +3315,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3062,7 +3331,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3070,7 +3340,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3086,9 +3356,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3137,13 +3409,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3202,7 +3472,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3225,11 +3495,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3279,6 +3555,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3337,12 +3616,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3370,10 +3649,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3385,11 +3666,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3411,6 +3698,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3419,6 +3707,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3430,6 +3719,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3460,139 +3751,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3603,7 +3761,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3619,7 +3778,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3778,16 +3937,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3799,14 +3949,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -3890,7 +4040,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -4150,24 +4300,24 @@ static __always_inline void unaligned_poke_u64(const size_t expected_alignment, } #define UNALIGNED_PEEK_8(ptr, struct, field) \ - peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) + peek_u8(ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_8(ptr, struct, field, value) \ - poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) + poke_u8(ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_16(ptr, struct, field) \ - unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_16(ptr, struct, field, value) \ - unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_32(ptr, struct, field) \ - unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_32(ptr, struct, field, value) \ - unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_64(ptr, struct, field) \ - unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_64(ptr, struct, field, value) \ - unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value) /* Get the page number pointed to by a branch node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t @@ -4231,13 +4381,13 @@ static __always_inline void node_set_flags(MDBX_node *const __restrict node, /* Address of the key for the node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_key(const MDBX_node *const __restrict node) { - return (char *)node + NODESIZE; + return ptr_disp(node, 
NODESIZE); } /* Address of the data for a node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_data(const MDBX_node *const __restrict node) { - return (char *)node_key(node) + node_ks(node); + return ptr_disp(node_key(node), node_ks(node)); } /* Size of a node in a leaf page with a given key and data. @@ -4543,7 +4693,7 @@ pgno2bytes(const MDBX_env *env, size_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * pgno2page(const MDBX_env *env, size_t pgno) { - return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); + return ptr_disp(env->me_map, pgno2bytes(env, pgno)); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t @@ -4570,7 +4720,7 @@ bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { /* Address of first usable data byte in a page, after the header */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_data(const MDBX_page *mp) { - return (char *)mp + PAGEHDRSZ; + return ptr_disp(mp, PAGEHDRSZ); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page * @@ -4642,7 +4792,7 @@ __cold static const char *pagetype_caption(const uint8_t type, } } -__cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) +__cold static int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) { if (LOG_ENABLED(MDBX_LOG_ERROR)) { static const MDBX_page *prev; @@ -4689,7 +4839,7 @@ page_node(const MDBX_page *mp, size_t i) { assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); assert(page_numkeys(mp) > i); assert(mp->mp_ptrs[i] % 2 == 0); - return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); + return ptr_disp(mp, mp->mp_ptrs[i] + PAGEHDRSZ); } /* The address of a key in a LEAF2 page. 
@@ -4700,7 +4850,7 @@ page_leaf2key(const MDBX_page *mp, size_t i, size_t keysize) { assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; - return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); + return ptr_disp(mp, PAGEHDRSZ + i * mp->mp_leaf2_ksize); } /* Set the node's key into keyptr. */ @@ -5054,7 +5204,12 @@ static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { #define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) #define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) -static __thread uint64_t rthc_thread_state; +static __thread uint64_t rthc_thread_state +#if __has_attribute(tls_model) && \ + (defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY) + __attribute__((tls_model("local-dynamic"))) +#endif + ; #if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) @@ -5256,7 +5411,7 @@ __cold void thread_dtor(void *rthc) { if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), __Wpedantic_format_voidptr(reader)); - atomic_cas32(&reader->mr_pid, self_pid, 0); + (void)atomic_cas32(&reader->mr_pid, self_pid, 0); } } @@ -5414,7 +5569,7 @@ __cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, goto bailout; } if (rthc_table == rthc_table_static) - memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); + memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit); rthc_table = new_table; rthc_limit *= 2; } @@ -5559,7 +5714,7 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { ? uniq_peek(pending, &scan->me_lck_mmap) : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { - uint64_t length; + uint64_t length = 0; if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. 
just created. */ @@ -5571,7 +5726,7 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), - MDBX_SYNC_NONE); + MDBX_SYNC_KICK); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { @@ -6004,7 +6159,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } while (++r != end); \ \ if (unlikely(key_diff_mask < 256)) { \ - memcpy(begin, tmp, (char *)end - (char *)begin); \ + memcpy(begin, tmp, ptr_dist(end, begin)); \ break; \ } \ end = (r = tmp) + length; \ @@ -6102,11 +6257,11 @@ static __always_inline size_t pnl_size2bytes(size_t size) { #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + - MDBX_PNL_GRANULATE + 2) * + MDBX_PNL_GRANULATE + 3) * sizeof(pgno_t) < SIZE_MAX / 4 * 3); size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3), MDBX_PNL_GRANULATE * sizeof(pgno_t)) - MDBX_ASSUME_MALLOC_OVERHEAD; return bytes; @@ -6114,8 +6269,8 @@ static __always_inline size_t pnl_size2bytes(size_t size) { static __always_inline pgno_t pnl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); - assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); - size -= 2; + assert(size > 3 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); + size -= 3; #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size >>= 1; #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ @@ -6131,8 +6286,8 @@ static MDBX_PNL pnl_alloc(size_t size) { #endif /* malloc_usable_size */ pl[0] = pnl_bytes2size(bytes); assert(pl[0] >= size); - pl[1] = 0; pl += 1; + *pl = 0; } return pl; } @@ -6151,8 +6306,9 @@ static void pnl_shrink(MDBX_PNL *ppl) { MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); 
MDBX_PNL_SETSIZE(*ppl, 0); if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > - MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL); + MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 8 : 4) - + MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2); MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -6315,7 +6471,7 @@ pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, // clang<=5: cmov×2, set+add/sub // clang>=6: cmov, set+add/sub *dst = flag ? *src_a : *src_b; - src_b += flag - 1; + src_b += (ptrdiff_t)flag - 1; src_a -= flag; #endif --dst; @@ -6334,61 +6490,75 @@ __hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { const size_t src_len = MDBX_PNL_GETSIZE(src); const size_t dst_len = MDBX_PNL_GETSIZE(dst); size_t total = dst_len; + assert(MDBX_PNL_ALLOCLEN(dst) >= total); if (likely(src_len > 0)) { total += src_len; - assert(MDBX_PNL_ALLOCLEN(dst) >= total); - dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); - pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12)) + goto avoid_call_libc_for_short_cases; + if (dst_len == 0 || + MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src))) + memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t)); + else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) { + memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst), + dst_len * sizeof(pgno_t)); + memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src), + src_len * sizeof(pgno_t)); + } else { + avoid_call_libc_for_short_cases: + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 
0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + } MDBX_PNL_SETSIZE(dst, total); } assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); return total; } -static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { - tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spill_pages) && - txn->tw.spill_least_removed > 0); - txn->tw.spill_least_removed = - (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); +static void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && + txn->tw.spilled.least_removed > 0); + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? idx + : txn->tw.spilled.least_removed; + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); while (unlikely(npages > 1)) { - const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; + const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1; if (MDBX_PNL_ASCENDING) { - if (++idx > MDBX_PNL_GETSIZE(txn->tw.spill_pages) || - (txn->tw.spill_pages[idx] >> 1) != pgno) + if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || + (txn->tw.spilled.list[idx] >> 1) != pgno) return; } else { - if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) + if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno) return; - txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) - ? idx - : txn->tw.spill_least_removed; + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? 
idx + : txn->tw.spilled.least_removed; } - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); --npages; } } static MDBX_PNL spill_purge(MDBX_txn *txn) { - tASSERT(txn, txn->tw.spill_least_removed > 0); - const MDBX_PNL sl = txn->tw.spill_pages; - if (txn->tw.spill_least_removed != INT_MAX) { + tASSERT(txn, txn->tw.spilled.least_removed > 0); + const MDBX_PNL sl = txn->tw.spilled.list; + if (txn->tw.spilled.least_removed != INT_MAX) { size_t len = MDBX_PNL_GETSIZE(sl), r, w; - for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { + for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) { sl[w] = sl[r]; w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) tASSERT(txn, (sl[i] & 1) == 0); MDBX_PNL_SETSIZE(sl, w - 1); - txn->tw.spill_least_removed = INT_MAX; + txn->tw.spilled.least_removed = INT_MAX; } else { for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) tASSERT(txn, (sl[i] & 1) == 0); @@ -6438,23 +6608,34 @@ __hot __noinline static size_t pnl_search_nochk(const MDBX_PNL pnl, static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, size_t limit) { assert(pnl_check_allocated(pnl, limit)); + if (MDBX_HAVE_CMOV) { + /* cmov-ускоренный бинарный поиск может читать (но не использовать) один + * элемент за концом данных, этот элемент в пределах выделенного участка + * памяти, но не инициализирован. 
*/ + VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } assert(pgno < limit); (void)limit; - return pnl_search_nochk(pnl, pgno); + size_t n = pnl_search_nochk(pnl, pgno); + if (MDBX_HAVE_CMOV) { + VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } + return n; } static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { - const MDBX_PNL pnl = txn->tw.spill_pages; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return 0; pgno <<= 1; - size_t n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); + size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1); return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? n : 0; } static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - pgno_t npages) { - const MDBX_PNL pnl = txn->tw.spill_pages; + size_t npages) { + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return false; const size_t len = MDBX_PNL_GETSIZE(pnl); @@ -6466,7 +6647,7 @@ static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, DEBUG_EXTRA_PRINT("%s\n", "]"); } const pgno_t spilled_range_begin = pgno << 1; - const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; + const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1; #if MDBX_PNL_ASCENDING const size_t n = pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); @@ -6475,7 +6656,7 @@ static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; #else const size_t n = - pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1); assert(n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_last >= pnl[n])); const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; #endif @@ -6514,8 +6695,8 @@ 
static MDBX_TXL txl_alloc(void) { #endif /* malloc_usable_size */ tl[0] = txl_bytes2size(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); - tl[1] = 0; tl += 1; + *tl = 0; } return tl; } @@ -6588,11 +6769,9 @@ static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { /*----------------------------------------------------------------------------*/ -#define MDBX_DPL_UNSORTED_BACKLOG 16 -#define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG -#define MDBX_DPL_GAP_FOR_EDGING 2 -#define MDBX_DPL_RESERVE_GAP \ - (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) +#define MDBX_DPL_GAP_MERGESORT 16 +#define MDBX_DPL_GAP_EDGING 2 +#define MDBX_DPL_RESERVE_GAP (MDBX_DPL_GAP_MERGESORT + MDBX_DPL_GAP_EDGING) static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); @@ -6625,25 +6804,32 @@ static __always_inline size_t dpl_bytes2size(const ptrdiff_t bytes) { } static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { - static const MDBX_page dpl_stub_pageE = { - {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; + static const MDBX_page dpl_stub_pageE = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ ~(pgno_t)0}; assert(dpl_stub_pageE.mp_flags == P_BAD && dpl_stub_pageE.mp_pgno == P_INVALID); dl->length = len; dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; dl->items[len + 1].pgno = P_INVALID; - dl->items[len + 1].extra = 0; + dl->items[len + 1].npages = 1; return len; } static __always_inline void dpl_clear(MDBX_dpl *dl) { - static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0}; + static const MDBX_page dpl_stub_pageB = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ 0}; assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); dl->sorted = dpl_setlen(dl, 0); dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].pgno = 0; - dl->items[0].extra = 0; + dl->items[0].npages = 1; assert(dl->items[0].pgno == 0 && 
dl->items[dl->length + 1].pgno == P_INVALID); } @@ -6676,19 +6862,19 @@ static int dpl_alloc(MDBX_txn *txn) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) - ? txn->mt_env->me_options.dp_initial - : txn->mt_geo.upper; - if (txn->tw.dirtylist) { - dpl_clear(txn->tw.dirtylist); - const int realloc_threshold = 64; - if (likely( - !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold || - (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) - return MDBX_SUCCESS; - } - if (unlikely(!dpl_reserve(txn, wanna))) + const size_t wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) + ? txn->mt_env->me_options.dp_initial + : txn->mt_geo.upper; +#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG + if (txn->tw.dirtylist) + /* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */ + txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0; +#endif /* asertions enabled */ + if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || + txn->tw.dirtylist->detent > wanna + wanna) && + unlikely(!dpl_reserve(txn, wanna))) return MDBX_ENOMEM; + dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; } @@ -6711,7 +6897,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { if (dl->sorted > unsorted / 4 + 4 && (MDBX_DPL_PREALLOC_FOR_RADIXSORT || - dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) { + dl->length + unsorted < dl->detent + MDBX_DPL_GAP_MERGESORT)) { MDBX_dp *const sorted_begin = dl->items + 1; MDBX_dp *const sorted_end = sorted_begin + dl->sorted; MDBX_dp *const end = @@ -6734,7 +6920,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { #else *w = cmp ? 
*l : *r; l -= cmp; - r += cmp - 1; + r += (ptrdiff_t)cmp - 1; #endif } while (likely(--w > l)); assert(r == tmp - 1); @@ -6819,7 +7005,7 @@ __hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned dpl_npages(const MDBX_dpl *dl, size_t i) { assert(0 <= (intptr_t)i && i <= dl->length); - unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; + unsigned n = dl->items[i].npages; assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); return n; } @@ -6830,7 +7016,7 @@ dpl_endpgno(const MDBX_dpl *dl, size_t i) { } static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -6857,7 +7043,8 @@ static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) { +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +dpl_exist(const MDBX_txn *txn, pgno_t pgno) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; size_t i = dpl_search(txn, pgno); @@ -6888,7 +7075,7 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, return nullptr; } -static void dpl_remove_ex(const MDBX_txn *txn, size_t i, pgno_t npages) { +static void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -6907,31 +7094,64 @@ static void dpl_remove(const MDBX_txn *txn, size_t i) { dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); } +static __noinline void txn_lru_reduce(MDBX_txn *txn) { + NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); + tASSERT(txn, (txn->mt_flags & 
(MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + do { + txn->tw.dirtylru >>= 1; + MDBX_dpl *dl = txn->tw.dirtylist; + for (size_t i = 1; i <= dl->length; ++i) { + size_t *const ptr = + ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr >>= 1; + } + txn = txn->mt_parent; + } while (txn); +} + +MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, + size_t i) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((intptr_t)i > 0 && i <= dl->length); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + return txn->tw.dirtylru - (uint32_t)*ptr; +} + +static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { + txn->tw.dirtylru += 1; + if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && + (txn->mt_flags & MDBX_WRITEMAP) == 0) + txn_lru_reduce(txn); + return txn->tw.dirtylru; +} + static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_dp dp = {page, pgno, (pgno_t)npages}; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } + MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); if (AUDIT_ENABLED()) { for (size_t i = dl->length; i > 0; --i) { - assert(dl->items[i].pgno != pgno); - if (unlikely(dl->items[i].pgno == pgno)) { - ERROR("Page %u already exist in the DPL at %zu", pgno, i); + assert(dl->items[i].pgno != dp.pgno); + if (unlikely(dl->items[i].pgno == 
dp.pgno)) { + ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i); return MDBX_PROBLEM; } } } - const size_t length = dl->length + 1; - const size_t sorted = - (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) - ? length - : dl->sorted; - if (unlikely(dl->length == dl->detent)) { if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); @@ -6946,27 +7166,79 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, tASSERT(txn, dl->length < dl->detent); } - /* copy the stub beyond the end */ - dl->items[length + 1] = dl->items[length]; - /* append page */ - dl->items[length].ptr = page; - dl->items[length].pgno = pgno; - dl->items[length].multi = npages > 1; - dl->items[length].lru = txn->tw.dirtylru++; - dl->length = length; - dl->sorted = sorted; - dl->pages_including_loose += npages; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return MDBX_SUCCESS; -} + /* Сортировка нужна для быстрого поиска, используем несколько тактик: + * 1) Сохраняем упорядоченность при естественной вставке в нужном порядке. + * 2) Добавляем в не-сортированный хвост, который сортируем и сливаем + * с отсортированной головой по необходимости, а пока хвост короткий + * ищем в нём сканированием, избегая большой пересортировки. + * 3) Если не-сортированный хвост короткий, а добавляемый элемент близок + * к концу отсортированной головы, то выгоднее сразу вставить элемент + * в нужное место. + * + * Алгоритмически: + * - добавлять в не-сортированный хвост следует только если вставка сильно + * дорогая, т.е. 
если целевая позиция элемента сильно далека от конца; + * - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим + * от конца на максимально-приемлемое расстояние; + * - если список короче, либо элемент в этой позиции меньше вставляемого, + * то следует перемещать элементы и вставлять в отсортированную голову; + * - если не-сортированный хвост длиннее, либо элемент в этой позиции больше, + * то следует добавлять в не-сортированный хвост. */ -static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const MDBX_dpl *dl = txn->tw.dirtylist; - assert((intptr_t)i > 0 && i <= dl->length); - /* overflow could be here */ - return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); + dl->pages_including_loose += npages; + MDBX_dp *i = dl->items + dl->length; + +#define MDBX_DPL_INSERTION_THRESHOLD 42 + const ptrdiff_t pivot = (ptrdiff_t)dl->length - MDBX_DPL_INSERTION_THRESHOLD; +#if MDBX_HAVE_CMOV + const pgno_t pivot_pgno = + dl->items[(dl->length < MDBX_DPL_INSERTION_THRESHOLD) + ? 
0 + : dl->length - MDBX_DPL_INSERTION_THRESHOLD] + .pgno; +#endif /* MDBX_HAVE_CMOV */ + + /* copy the stub beyond the end */ + i[2] = i[1]; + dl->length += 1; + + if (likely(pivot <= (ptrdiff_t)dl->sorted) && +#if MDBX_HAVE_CMOV + pivot_pgno < dp.pgno) { +#else + (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) { +#endif /* MDBX_HAVE_CMOV */ + dl->sorted += 1; + + /* сдвигаем несортированный хвост */ + while (i >= dl->items + dl->sorted) { +#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */ + i[1] = *i; +#elif MDBX_WORDBITS == 64 && \ + (defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) + STATIC_ASSERT(sizeof(MDBX_dp) == sizeof(__uint128_t)); + ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i; +#else + i[1].ptr = i->ptr; + i[1].pgno = i->pgno; + i[1].npages = i->npages; +#endif + --i; + } + /* ищем нужную позицию сдвигая отсортированные элементы */ + while (i->pgno > pgno) { + tASSERT(txn, i > dl->items); + i[1] = *i; + --i; + } + tASSERT(txn, i->pgno < dp.pgno); + } + + i[1] = dp; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + assert(dl->sorted <= dl->length); + return MDBX_SUCCESS; } /*----------------------------------------------------------------------------*/ @@ -6979,7 +7251,7 @@ static __must_check_result __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - pgno_t npages); + size_t npages); typedef struct page_result { MDBX_page *page; int err; @@ -6988,10 +7260,10 @@ typedef struct page_result { static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); -static pgr_t page_new_large(MDBX_cursor *mc, const pgno_t npages); +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); static int page_touch(MDBX_cursor *mc); -static int cursor_touch(MDBX_cursor *mc); -static int touch_dbi(MDBX_cursor 
*mc); +static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data); #define MDBX_END_NAMES \ { \ @@ -7016,27 +7288,27 @@ enum { static int txn_end(MDBX_txn *txn, const unsigned mode); static __always_inline pgr_t page_get_inline(const uint16_t ILL, - MDBX_cursor *const mc, + const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front); -static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno, +static pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS, mc, pgno, front); } -__hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno, - const txnid_t front) { +__hot static pgr_t page_get_three(const MDBX_cursor *const mc, + const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); } -static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno, +static pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, front); } -static __always_inline int __must_check_result page_get(MDBX_cursor *mc, +static __always_inline int __must_check_result page_get(const MDBX_cursor *mc, const pgno_t pgno, MDBX_page **mp, const txnid_t front) { @@ -7062,9 +7334,8 @@ static int __must_check_result page_split(MDBX_cursor *mc, MDBX_val *const newdata, pgno_t newpgno, const unsigned naf); -static int coherency_timeout(uint64_t *timestamp, pgno_t pgno); -static bool coherency_check_meta(const MDBX_env *env, - const volatile MDBX_meta *meta, bool report); +static int coherency_timeout(uint64_t *timestamp, intptr_t pgno, + const MDBX_env *env); static int __must_check_result validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, MDBX_meta *dest); @@ -7110,14 +7381,25 @@ static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result 
audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc); -static int __must_check_result page_check(MDBX_cursor *const mc, +static int __must_check_result page_check(const MDBX_cursor *const mc, const MDBX_page *const mp); -static int __must_check_result cursor_check(MDBX_cursor *mc); +static int __must_check_result cursor_check(const MDBX_cursor *mc); +static int __must_check_result cursor_get(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_put_checklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); +static int __must_check_result cursor_put_nochecklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); static int __must_check_result cursor_check_updating(MDBX_cursor *mc); -static int __must_check_result cursor_del(MDBX_cursor *mc); -static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, - const MDBX_val *data, unsigned flags); +static int __must_check_result cursor_del(MDBX_cursor *mc, + MDBX_put_flags_t flags); +static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, const MDBX_val *data, + unsigned flags); #define SIBLING_LEFT 0 #define SIBLING_RIGHT 2 static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); @@ -7154,11 +7436,8 @@ static int __must_check_result setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize); -static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, - cmp_int_unaligned, cmp_lenfast; - -static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags); -static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags); +static __inline MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags); +static __inline MDBX_cmp_func *get_default_datacmp(MDBX_db_flags_t flags); __cold const char *mdbx_liberr2str(int errnum) { /* Table of descriptions for MDBX errors */ @@ -7227,6 +7506,9 @@ 
__cold const char *mdbx_liberr2str(int errnum) { case MDBX_TXN_OVERLAPPING: return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" " the current thread"; + case MDBX_DUPLICATED_CLK: + return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists, " + "please keep one and remove unused other"; default: return NULL; } @@ -7554,18 +7836,20 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { if (likely(num == 1 && np)) { eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, np, size); - VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dp_reserve = np->mp_next; + VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), + size + sizeof(size_t)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(np), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(np); env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); - np = osal_malloc(size); - if (unlikely(!np)) { + void *const ptr = osal_malloc(size + sizeof(size_t)); + if (unlikely(!ptr)) { txn->mt_flags |= MDBX_TXN_ERROR; - return np; + return nullptr; } - VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t)); + np = ptr_disp(ptr, sizeof(size_t)); } if ((env->me_flags & MDBX_NOMEMINIT) == 0) { @@ -7575,7 +7859,7 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { size_t skip = PAGEHDRSZ; if (num > 1) skip += pgno2bytes(env, num - 1); - memset((char *)np + skip, 0, size - skip); + memset(ptr_disp(np, skip), 0, size - skip); } #if MDBX_DEBUG np->mp_pgno = 0; @@ -7587,23 +7871,24 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { } /* Free a shadow dirty page */ -static void dpage_free(MDBX_env *env, MDBX_page *dp, pgno_t npages) { +static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); if 
(unlikely(env->me_flags & MDBX_PAGEPERTURB)) memset(dp, -1, pgno2bytes(env, npages)); if (npages == 1 && env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { - MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), - env->me_psize - sizeof(dp->mp_next)); - dp->mp_next = env->me_dp_reserve; - VALGRIND_MEMPOOL_FREE(env, dp); + MDBX_ASAN_POISON_MEMORY_REGION(dp, env->me_psize); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + mp_next(dp) = env->me_dp_reserve; + VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t))); env->me_dp_reserve = dp; env->me_dp_reserve_len += 1; } else { /* large pages just get freed directly */ - VALGRIND_MEMPOOL_FREE(env, dp); - osal_free(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); } } @@ -7655,10 +7940,12 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - const uint32_t age = dpl_age(txn, i); - tASSERT(txn, age < UINT32_MAX / 3); - if (unlikely(age > UINT32_MAX / 3)) - return false; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) + return false; + } tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); if (dp->mp_flags == P_LOOSE) { @@ -7767,7 +8054,7 @@ static void refund_loose(MDBX_txn *txn) { tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; size_t w = 0; - for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { + for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { tASSERT(txn, lp->mp_flags == P_LOOSE); tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { @@ -7777,6 +8064,8 @@ static void refund_loose(MDBX_txn *txn) { suitable[++w] = lp->mp_pgno; most = 
(lp->mp_pgno > most) ? lp->mp_pgno : most; } + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } if (most + 1 == txn->mt_next_pgno) { @@ -7868,10 +8157,12 @@ static void refund_loose(MDBX_txn *txn) { for (MDBX_page **link = &txn->tw.loose_pages; *link;) { MDBX_page *dp = *link; tASSERT(txn, dp->mp_flags == P_LOOSE); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); if (txn->mt_next_pgno > dp->mp_pgno) { - link = &dp->mp_next; + link = &mp_next(dp); } else { - *link = dp->mp_next; + *link = mp_next(dp); if ((txn->mt_flags & MDBX_WRITEMAP) == 0) dpage_free(txn->mt_env, dp, 1); } @@ -7909,7 +8200,7 @@ static bool txn_refund(MDBX_txn *txn) { if (before == txn->mt_next_pgno) return false; - if (txn->tw.spill_pages) + if (txn->tw.spilled.list) /* Squash deleted pagenums if we refunded any */ spill_purge(txn); @@ -7924,9 +8215,9 @@ static __inline bool txn_refund(MDBX_txn *txn) { #endif /* MDBX_ENABLE_REFUND */ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - pgno_t npages) { + size_t npages) { MDBX_env *const env = txn->mt_env; - DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); + DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno); eASSERT(env, pgno >= NUM_METAS && npages); if (!IS_FROZEN(txn, mp)) { const size_t bytes = pgno2bytes(env, npages); @@ -7937,7 +8228,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } else { struct iovec iov[MDBX_AUXILARY_IOV_MAX]; iov[0].iov_len = env->me_psize; - iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; + iov[0].iov_base = ptr_disp(env->me_pbuf, env->me_psize); size_t iov_off = pgno2bytes(env, pgno), n = 1; while (--npages) { iov[n] = iov[0]; @@ -7951,28 +8242,52 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } } -/* Remove page from dirty list */ -static __inline void 
page_wash(MDBX_txn *txn, const size_t di, - MDBX_page *const mp, const pgno_t npages) { +/* Remove page from dirty list, etc */ +static __inline void page_wash(MDBX_txn *txn, size_t di, MDBX_page *const mp, + const size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, di && di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - dpl_remove_ex(txn, di, npages); - txn->tw.dirtyroom++; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_BAD; + + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, + MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp)); + if (!MDBX_AVOID_MSYNC || di) { + dpl_remove_ex(txn, di, npages); + txn->tw.dirtyroom++; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + dpage_free(txn->mt_env, mp, npages); + return; + } + } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di); + txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) + ? 
npages + : txn->tw.writemap_dirty_npages; + } VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); - if (txn->mt_flags & MDBX_WRITEMAP) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - } else - dpage_free(txn->mt_env, mp, npages); + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); +} + +static __inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) { + /* TODO: + * 1) при включенной "экономии последовательностей" проверить, что + * страница не примыкает к какой-либо из уже находящийся в reclaimed. + * 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать + половину в reclaimed. */ + return txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || + txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit); } /* Retire, loosen or free a single page. @@ -8001,9 +8316,15 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * requires support the list of dirty pages and avoid explicit spilling. * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. 
*/ - size_t di = 0, si = 0; - pgno_t npages = 1; - bool is_frozen = false, is_spilled = false, is_shadowed = false; + size_t di = 0, si = 0, npages = 1; + enum page_status { + unknown, + frozen, + spilled, + shadowed, + modifable + } status = unknown; + if (unlikely(!mp)) { if (ASSERT_ENABLED() && pageflags) { pgr_t check; @@ -8015,10 +8336,10 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pageflags & P_FROZEN) { - is_frozen = true; + status = frozen; if (ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - tASSERT(txn, !search_spilled(scan, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(scan, pgno)); tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); } } @@ -8027,24 +8348,25 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; tASSERT(txn, IS_MODIFIABLE(txn, mp)); + status = modifable; goto status_done; } if ((si = search_spilled(txn, pgno)) != 0) { - is_spilled = true; + status = spilled; goto status_done; } for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { if (dpl_exist(parent, pgno)) { - is_shadowed = true; + status = shadowed; goto status_done; } if (search_spilled(parent, pgno)) { - is_spilled = true; + status = spilled; goto status_done; } } - is_frozen = true; + status = frozen; goto status_done; } @@ -8056,27 +8378,28 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, pageflags = mp->mp_flags; } - is_frozen = IS_FROZEN(txn, mp); - if (!is_frozen) { - const bool is_dirty = IS_MODIFIABLE(txn, mp); - is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); - is_shadowed = IS_SHADOWED(txn, mp); - if (is_dirty) { - tASSERT(txn, !is_spilled); - tASSERT(txn, !search_spilled(txn, pgno)); - tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || - (txn->mt_flags 
& MDBX_WRITEMAP)); - } else { - tASSERT(txn, !debug_dpl_find(txn, pgno)); - } - - di = (is_dirty && txn->tw.dirtylist) ? dpl_exist(txn, pgno) : 0; - si = is_spilled ? search_spilled(txn, pgno) : 0; - tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); - } else { + if (IS_FROZEN(txn, mp)) { + status = frozen; tASSERT(txn, !IS_MODIFIABLE(txn, mp)); tASSERT(txn, !IS_SPILLED(txn, mp)); tASSERT(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_MODIFIABLE(txn, mp)) { + status = modifable; + if (txn->tw.dirtylist) + di = dpl_exist(txn, pgno); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) || !IS_SPILLED(txn, mp)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_SHADOWED(txn, mp)) { + status = shadowed; + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + } else { + tASSERT(txn, IS_SPILLED(txn, mp)); + status = spilled; + si = search_spilled(txn, pgno); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } status_done: @@ -8097,12 +8420,12 @@ status_done: } else { npages = mp->mp_pages; cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); - mc->mc_db->md_overflow_pages -= npages; + mc->mc_db->md_overflow_pages -= (pgno_t)npages; } - if (is_frozen) { + if (status == frozen) { retire: - DEBUG("retire %u page %" PRIaPGNO, npages, pgno); + DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); tASSERT(txn, dirtylist_check(txn)); return rc; @@ -8113,7 +8436,7 @@ status_done: * нераспределенного "хвоста" БД сдвигается только при их коммите. 
*/ if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { const char *kind = nullptr; - if (di) { + if (status == modifable) { /* Страница испачкана в этой транзакции, но до этого могла быть * аллоцирована, испачкана и пролита в одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ @@ -8125,10 +8448,8 @@ status_done: * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "spilled"; + tASSERT(txn, status == spilled); spill_remove(txn, si, npages); - } else if (txn->mt_flags & MDBX_WRITEMAP) { - kind = "writemap"; - tASSERT(txn, mp && IS_MODIFIABLE(txn, mp)); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной * из родительских транзакций. @@ -8140,38 +8461,36 @@ status_done: parent = parent->mt_parent) { if (search_spilled(parent, pgno)) { kind = "parent-spilled"; - tASSERT(txn, is_spilled); + tASSERT(txn, status == spilled); break; } if (mp == debug_dpl_find(parent, pgno)) { kind = "parent-dirty"; - tASSERT(txn, !is_spilled); + tASSERT(txn, status == shadowed); break; } } tASSERT(txn, kind != nullptr); } - tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); + tASSERT(txn, status == spilled || status == shadowed); } - DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; txn_refund(txn); return MDBX_SUCCESS; } - if (di) { + if (status == modifable) { /* Dirty page from this transaction */ /* If suitable we can reuse it through loose list */ - if (likely(npages == 1 && - txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && - (!MDBX_ENABLE_REFUND || - /* skip pages near to the end in favor of compactification */ - txn->mt_next_pgno > - pgno + txn->mt_env->me_options.dp_loose_limit || - txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { + if (likely(npages == 1 && suitable4loose(txn, pgno)) && + (di 
|| !txn->tw.dirtylist)) { DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_LOOSE; - mp->mp_next = txn->tw.loose_pages; + mp_next(mp) = txn->tw.loose_pages; txn->tw.loose_pages = mp; txn->tw.loose_count++; #if MDBX_ENABLE_REFUND @@ -8179,8 +8498,6 @@ status_done: ? pgno + 2 : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->mt_env->me_psize - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), @@ -8218,11 +8535,12 @@ status_done: } } skip_invalidate: - /* Remove from dirty list */ + + /* wash dirty page */ page_wash(txn, di, mp, npages); reclaim: - DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); + DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno); rc = pnl_insert_range(&txn->tw.relist, pgno, npages); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -8248,7 +8566,7 @@ status_done: goto reclaim; } - if (is_shadowed) { + if (status == shadowed) { /* Dirty page MUST BE a clone from (one of) parent transaction(s). 
*/ if (ASSERT_ENABLED()) { const MDBX_page *parent_dp = nullptr; @@ -8282,6 +8600,7 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { typedef struct iov_ctx { MDBX_env *env; osal_ioring_t *ior; + mdbx_filehandle_t fd; int err; #ifndef MDBX_NEED_WRITTEN_RANGE #define MDBX_NEED_WRITTEN_RANGE 1 @@ -8294,10 +8613,17 @@ typedef struct iov_ctx { } iov_ctx_t; __must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, - size_t items, size_t npages) { + size_t items, size_t npages, + mdbx_filehandle_t fd, + bool check_coherence) { ctx->env = txn->mt_env; ctx->ior = &txn->mt_env->me_ioring; - ctx->err = osal_ioring_reserve(ctx->ior, items, + ctx->fd = fd; + ctx->coherency_timestamp = + (check_coherence || txn->mt_env->me_lck->mti_pgop_stat.incoherence.weak) + ? 0 + : UINT64_MAX /* не выполнять сверку */; + ctx->err = osal_ioring_prepare(ctx->ior, items, pgno_align2os_bytes(txn->mt_env, npages)); if (likely(ctx->err == MDBX_SUCCESS)) { #if MDBX_NEED_WRITTEN_RANGE @@ -8324,18 +8650,72 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); if (likely(ctx->err == MDBX_SUCCESS)) { - VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes); - osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); - const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); + const MDBX_page *const rp = ptr_disp(env->me_map, offset); + VALGRIND_MAKE_MEM_DEFINED(rp, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); + osal_flush_incoherent_mmap(rp, bytes, env->me_os_psize); /* check with timeout as the workaround - * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ - if (unlikely(memcmp(wp, rp, bytes))) { + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 + * + * Проблема проявляется только при неупорядоченности: если записанная + * последней 
мета-страница "обгоняет" ранее записанные, т.е. когда + * записанное в файл позже становится видимым в отображении раньше, + * чем записанное ранее. + * + * Исходно здесь всегда выполнялась полная сверка. Это давало полную + * гарантию защиты от проявления проблемы, но порождало накладные расходы. + * В некоторых сценариях наблюдалось снижение производительности до 10-15%, + * а в синтетических тестах до 30%. Конечно никто не вникал в причины, + * а просто останавливался на мнении "libmdbx не быстрее LMDB", + * например: https://clck.ru/3386er + * + * Поэтому после серии экспериментов и тестов реализовано следующее: + * 0. Посредством опции сборки MDBX_FORCE_CHECK_MMAP_COHERENCY=1 + * можно включить полную сверку после записи. + * Остальные пункты являются взвешенным компромиссом между полной + * гарантией обнаружения проблемы и бесполезными затратами на системах + * без этого недостатка. + * 1. При старте транзакций проверяется соответствие выбранной мета-страницы + * корневым страницам b-tree проверяется. Эта проверка показала себя + * достаточной без сверки после записи. При обнаружении "некогерентности" + * эти случаи подсчитываются, а при их ненулевом счетчике выполняется + * полная сверка. Таким образом, произойдет переключение в режим полной + * сверки, если показавшая себя достаточной проверка заметит проявление + * проблемы хоты-бы раз. + * 2. 
Сверка не выполняется при фиксации транзакции, так как: + * - при наличии проблемы "не-когерентности" (при отложенном копировании + * или обновлении PTE, после возврата из write-syscall), проверка + * в этом процессе не гарантирует актуальность данных в другом + * процессе, который может запустить транзакцию сразу после коммита; + * - сверка только последнего блока позволяет почти восстановить + * производительность в больших транзакциях, но одновременно размывает + * уверенность в отсутствии сбоев, чем обесценивает всю затею; + * - после записи данных будет записана мета-страница, соответствие + * которой корневым страницам b-tree проверяется при старте + * транзакций, и только эта проверка показала себя достаточной; + * 3. При спиллинге производится полная сверка записанных страниц. Тут был + * соблазн сверять не полностью, а например начало и конец каждого блока. + * Но при спиллинге возможна ситуация повторного вытеснения страниц, в + * том числе large/overflow. При этом возникает риск прочитать в текущей + * транзакции старую версию страницы, до повторной записи. В этом случае + * могут возникать крайне редкие невоспроизводимые ошибки. С учетом того + * что спиллинг выполняет крайне редко, решено отказаться от экономии + * в пользу надежности. */ +#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY +#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0 +#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */ + if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || + ctx->coherency_timestamp != UINT64_MAX) && + unlikely(memcmp(wp, rp, bytes))) { ctx->coherency_timestamp = 0; + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? 
INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); do - if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) != + if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno, env) != MDBX_RESULT_TRUE) { ctx->err = MDBX_PROBLEM; break; @@ -8350,11 +8730,12 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, do { eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; + size_t npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; size_t chunk = pgno2bytes(env, npages); eASSERT(env, bytes >= chunk); + MDBX_page *next = ptr_disp(wp, chunk); dpage_free(env, wp, npages); - wp = (MDBX_page *)((char *)wp + chunk); + wp = next; offset += chunk; bytes -= chunk; } while (bytes); @@ -8369,12 +8750,10 @@ static void iov_complete(iov_ctx_t *ctx) { __must_check_result static int iov_write(iov_ctx_t *ctx) { eASSERT(ctx->env, !iov_empty(ctx)); - osal_ioring_write_result_t r = osal_ioring_write(ctx->ior); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd); #if MDBX_ENABLE_PGOP_STAT ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (!ctx->env->me_lck->mti_eoos_timestamp.weak) - ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); ctx->err = r.err; if (unlikely(ctx->err != MDBX_SUCCESS)) ERROR("Write error: %s", mdbx_strerror(ctx->err)); @@ -8383,7 +8762,7 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) { } __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, - MDBX_page *dp, pgno_t npages) { + MDBX_page *dp, size_t npages) { MDBX_env *const env = txn->mt_env; tASSERT(txn, ctx->err == MDBX_SUCCESS); tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); @@ -8427,47 +8806,55 @@ 
__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, #if MDBX_NEED_WRITTEN_RANGE ctx->flush_begin = (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + (pgno_t)npages) ? ctx->flush_end - : dp->mp_pgno + npages; + : dp->mp_pgno + (pgno_t)npages; #endif /* MDBX_NEED_WRITTEN_RANGE */ - env->me_lck->mti_unsynced_pages.weak += npages; return MDBX_SUCCESS; } static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - const pgno_t npages) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP) || MDBX_AVOID_MSYNC); + const size_t npages) { + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ const pgno_t pgno = dp->mp_pgno; int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS) && - (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP))) - err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); + if (likely(err == MDBX_SUCCESS)) + err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); return err; } /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. 
*/ -static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); size_t keep = 0; - while (mc->mc_flags & C_INITIALIZED) { - for (size_t i = 0; i < mc->mc_snum; ++i) { - const MDBX_page *mp = mc->mc_pg[i]; - if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { + while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { + tASSERT(txn, mc->mc_top == mc->mc_snum - 1); + const MDBX_page *mp; + size_t i = 0; + do { + mp = mc->mc_pg[i]; + tASSERT(txn, !IS_SUBP(mp)); + if (IS_MODIFIABLE(txn, mp)) { size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - dpl_age(txn, n)) { - txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; + /* не считаем дважды */ dpl_age(txn, n)) { + size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, + -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + tASSERT(txn, dpl_age(txn, n) == 0); ++keep; } } - } - if (!mc->mc_xcursor) + } while (++i < mc->mc_snum); + + tASSERT(txn, IS_LEAF(mp)); + if (!mc->mc_xcursor || mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) + break; + if (!(node_flags(page_node(mp, mc->mc_ki[mc->mc_top])) & F_SUBDATA)) break; mc = &mc->mc_xcursor->mx_cursor; } @@ -8475,8 +8862,8 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { } static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + txn_lru_turn(txn); size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && @@ -8491,24 +8878,21 @@ static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * 0 = should be spilled; * ... * > 255 = must not be spilled. */ -static unsigned spill_prio(const MDBX_txn *txn, const size_t i, - const uint32_t reciprocal) { +MDBX_NOTHROW_PURE_FUNCTION static unsigned +spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; const uint32_t age = dpl_age(txn, i); - const unsigned npages = dpl_npages(dl, i); + const size_t npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; if (age == 0) { - DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno); return 256; } MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - DEBUG("skip %s %u page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" - : (dp->mp_flags & P_LOOSE) ? "loose" - : "parent-spilled", - npages, pgno); + DEBUG("skip %s %zu page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) ? 
"loose" : "parent-spilled", npages, pgno); return 256; } @@ -8518,7 +8902,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { do if (intersect_spilled(parent, pgno, npages)) { - DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_SPILLED; return 256; } @@ -8532,15 +8916,15 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, return prio = 256 - prio; /* make a large/overflow pages be likely to spill */ - uint32_t factor = npages | npages >> 1; + size_t factor = npages | npages >> 1; factor |= factor >> 2; factor |= factor >> 4; factor |= factor >> 8; factor |= factor >> 16; - factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 255 - factor : 0; tASSERT(txn, factor < 256 && factor < (256 - prio)); - return prio = factor; + return prio = (unsigned)factor; } /* Spill pages from the dirty list back to disk. @@ -8578,12 +8962,15 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const size_t need) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, !m0 || cursor_is_tracked(m0)); - intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; - intptr_t wanna_spill_npages = - need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count - - txn->mt_env->me_options.dp_limit; + const intptr_t wanna_spill_entries = + txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; + const intptr_t wanna_spill_npages = + need + + (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count - txn->mt_env->me_options.dp_limit; /* production mode */ if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) @@ -8611,7 +8998,7 @@ static size_t spill_gate(const MDBX_env *env, intptr_t part, : 0); part = (part < spill_max) ? part : spill_max; part = (part > spill_min) ? part : spill_min; - eASSERT(env, part > 0 && (size_t)part <= total); + eASSERT(env, part >= 0 && (size_t)part <= total); return (size_t)part; } @@ -8620,15 +9007,19 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_npages, const size_t need) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); int rc = MDBX_SUCCESS; - if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count)) + if (unlikely(txn->tw.loose_count >= + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages))) goto done; - const size_t dirty_entries = txn->tw.dirtylist->length - txn->tw.loose_count; + const size_t dirty_entries = + txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; const size_t dirty_npages = - txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; + (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count; const size_t need_spill_entries = spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); const size_t need_spill_npages = @@ -8640,51 +9031,63 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (!need_spill) goto done; -#if !MDBX_AVOID_MSYNC if (txn->mt_flags & MDBX_WRITEMAP) { NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages); - tASSERT(txn, txn->tw.spill_pages == nullptr); const MDBX_env *env = txn->mt_env; + tASSERT(txn, txn->tw.spilled.list == nullptr); rc = osal_msync(&txn->mt_env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_NONE); + pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; +#if MDBX_AVOID_MSYNC + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); + tASSERT(txn, dirtylist_check(txn)); + env->me_lck->mti_unsynced_pages.weak += + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; dpl_clear(txn->tw.dirtylist); txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; - for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = lp->mp_next) { + for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); rc = dpl_append(txn, lp->mp_pgno, lp, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } + tASSERT(txn, dirtylist_check(txn)); +#else + tASSERT(txn, txn->tw.dirtylist == nullptr); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; + txn->tw.writemap_dirty_npages = 0; +#endif /* MDBX_AVOID_MSYNC */ goto done; } -#endif /* MDBX_AVOID_MSYNC */ NOTICE("%s-spilling %zu 
dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages); + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - if (!txn->tw.spill_pages) { - txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = pnl_alloc(need_spill); - if (unlikely(!txn->tw.spill_pages)) { - rc = MDBX_ENOMEM; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return rc; - } - } else { - /* purge deleted slots */ - spill_purge(txn); - rc = pnl_reserve(&txn->tw.spill_pages, need_spill); - (void)rc /* ignore since the resulting list may be shorter - and pnl_append() will increase pnl on demand */ - ; + if (!txn->tw.spilled.list) { + txn->tw.spilled.least_removed = INT_MAX; + txn->tw.spilled.list = pnl_alloc(need_spill); + if (unlikely(!txn->tw.spilled.list)) { + rc = MDBX_ENOMEM; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; } + } else { + /* purge deleted slots */ + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spilled.list, need_spill); + (void)rc /* ignore since the resulting list may be shorter + and pnl_append() will increase pnl on demand */ + ; } /* Сортируем чтобы запись на диск была полее последовательна */ @@ -8697,7 +9100,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - ERROR("all %zu dirty pages are unspillable since referenced " + ERROR("all %zu dirty pages are unspillable since referenced " "by a cursor(s), use fewer cursors or increase " "MDBX_opt_txn_dp_limit", unspillable); @@ -8744,6 +9147,11 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (size_t i = 1; 
i <= dl->length; ++i) { const unsigned prio = spill_prio(txn, i, reciprocal); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + TRACE("page %" PRIaPGNO + ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", + dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N', + dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); if (prio < 256) { radix_entries[prio] += 1; spillable_entries += 1; @@ -8785,61 +9193,45 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); iov_ctx_t ctx; - rc = iov_init(txn, &ctx, amount_entries, amount_npages); + rc = + iov_init(txn, &ctx, amount_entries, amount_npages, +#if defined(_WIN32) || defined(_WIN64) + txn->mt_env->me_overlapped_fd ? txn->mt_env->me_overlapped_fd : +#endif + txn->mt_env->me_lazy_fd, + true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned prev_prio = 256, prio; - size_t r, w; - for (w = 0, r = 1; - r <= dl->length && (spilled_entries < need_spill_entries || - spilled_npages < need_spill_npages); - prev_prio = prio, ++r) { - prio = spill_prio(txn, r, reciprocal); - MDBX_page *const dp = dl->items[r].ptr; - if (prio < prio2adjacent) { - const pgno_t pgno = dl->items[r].pgno; - const unsigned npages = dpl_npages(dl, r); - if (prio <= prio2spill) { - if (prev_prio < prio2adjacent && prev_prio > prio2spill && - dpl_endpgno(dl, r - 1) == pgno) { - DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), - prev_prio); - --w; - const unsigned co_npages = dpl_npages(dl, r - 1); - rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, co_npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled_entries; - spilled_npages += co_npages; - } + size_t r = 0, w = 0; + pgno_t last = 0; + while (r < dl->length && (spilled_entries < need_spill_entries || + spilled_npages < need_spill_npages)) { + 
dl->items[++w] = dl->items[++r]; + unsigned prio = spill_prio(txn, w, reciprocal); + if (prio > prio2spill && + (prio >= prio2adjacent || last != dl->items[w].pgno)) + continue; - DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, - dp->mp_pgno, dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled_entries; - spilled_npages += npages; - continue; - } + const size_t e = w; + last = dpl_endpgno(dl, w); + while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && + spill_prio(txn, w, reciprocal) < prio2adjacent) + ; - if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - DEBUG("co-spill %u next-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - npages, dp->mp_pgno, dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - prio = prev_prio /* to continue co-spilling next adjacent pages */; - ++spilled_entries; - spilled_npages += npages; - continue; - } + for (size_t i = w; ++i <= e;) { + const unsigned npages = dpl_npages(dl, i); + prio = spill_prio(txn, i, reciprocal); + DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", + (prio > prio2spill) ? 
"co-" : "", i, npages, dl->items[i].pgno, + dpl_age(txn, i), prio); + tASSERT(txn, prio < 256); + ++spilled_entries; + spilled_npages += npages; + rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); + if (unlikely(rc != MDBX_SUCCESS)) + goto failed; } - dl->items[++w] = dl->items[r]; } VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, @@ -8847,9 +9239,10 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); tASSERT(txn, spilled_npages >= spilled_entries); - while (r <= dl->length) - dl->items[++w] = dl->items[r++]; - tASSERT(txn, r - 1 - w == spilled_entries); + failed: + while (r < dl->length) + dl->items[++w] = dl->items[++r]; + tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += spilled_entries; @@ -8863,10 +9256,9 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); - txn->mt_flags |= MDBX_TXN_SPILLS; - } + txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; + pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); + txn->mt_flags |= MDBX_TXN_SPILLS; NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", spilled_entries, spilled_npages, txn->tw.dirtyroom); } else { @@ -8874,7 +9266,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, for (size_t i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; VERBOSE( - "dirtylist[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), spill_prio(txn, i, reciprocal)); } @@ -8901,44 +9293,6 @@ done: : MDBX_TXN_FULL; } -static 
int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { - MDBX_txn *txn = mc->mc_txn; - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - return MDBX_SUCCESS; - } - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - - /* Estimate how much space this operation will take: */ - /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ - size_t need = CURSOR_STACK + 3; - /* 2) GC/FreeDB for any payload */ - if (mc->mc_dbi > FREE_DBI) { - need += txn->mt_dbs[FREE_DBI].md_depth + 3; - /* 3) Named DBs also dirty the main DB */ - if (mc->mc_dbi > MAIN_DBI) - need += txn->mt_dbs[MAIN_DBI].md_depth + 3; - } -#if xMDBX_DEBUG_SPILLING != 2 - /* production mode */ - /* 4) Double the page chain estimation - * for extensively splitting, rebalance and merging */ - need += need; - /* 5) Factor the key+data which to be put in */ - need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; -#else - /* debug mode */ - (void)key; - (void)data; - mc->mc_txn->mt_env->debug_dirtied_est = ++need; - mc->mc_txn->mt_env->debug_dirtied_act = 0; -#endif /* xMDBX_DEBUG_SPILLING == 2 */ - - return txn_spill(txn, mc, need); -} - /*----------------------------------------------------------------------------*/ static bool meta_bootid_match(const MDBX_meta *meta) { @@ -9232,6 +9586,7 @@ meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, const meta_troika_t *troika) { const uint8_t tail = troika->tail_and_flags & 3; + MDBX_ANALYSIS_ASSUME(tail < NUM_METAS); meta_ptr_t r; r.txnid = troika->txnid[tail]; r.ptr_v = METAPAGE(env, tail); @@ -9278,6 +9633,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { 
eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; return env->me_lck->mti_oldest_reader.weak = steady; } @@ -9336,8 +9692,8 @@ static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { } /* Find largest mvcc-snapshot still referenced. */ -__cold static pgno_t find_largest_snapshot(const MDBX_env *env, - pgno_t last_used_page) { +static pgno_t find_largest_snapshot(const MDBX_env *env, + pgno_t last_used_page) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* check for exclusive without-lck mode */)) { retry:; @@ -9366,10 +9722,13 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, /* Add a page to the txn's dirty list */ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + mp->mp_txnid = txn->mt_front; if (!txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + txn->tw.writemap_dirty_npages += npages; + tASSERT(txn, txn->tw.spilled.list == nullptr); return MDBX_SUCCESS; } tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -9382,30 +9741,27 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, #endif /* xMDBX_DEBUG_SPILLING == 2 */ int rc; - mp->mp_txnid = txn->mt_front; if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { - MDBX_page *loose = txn->tw.loose_pages; - DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = pnl_insert_range(&txn->tw.relist, loose->mp_pgno, 1); + MDBX_page *lp = txn->tw.loose_pages; + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->mp_pgno); + rc = pnl_insert_range(&txn->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - size_t di = dpl_search(txn, loose->mp_pgno); - tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); + size_t di = 
dpl_search(txn, lp->mp_pgno); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp); dpl_remove(txn, di); - txn->tw.loose_pages = loose->mp_next; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; txn->tw.dirtyroom++; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - dpage_free(txn->mt_env, loose, 1); - } + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) + dpage_free(txn->mt_env, lp, 1); } else { ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, mp, npages); - } return MDBX_TXN_FULL; } } @@ -9421,6 +9777,11 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, return MDBX_SUCCESS; } +static void mincore_clean_cache(const MDBX_env *const env) { + memset(env->me_lck->mti_mincore_cache.begin, -1, + sizeof(env->me_lck->mti_mincore_cache.begin)); +} + #if !(defined(_WIN32) || defined(_WIN64)) MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #ifdef ENOSYS @@ -9483,16 +9844,15 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, #endif /* F_RDAHEAD */ int err; + void *const ptr = ptr_disp(env->me_map, offset); if (enable) { #if defined(MADV_NORMAL) - err = madvise(env->me_map + offset, length, MADV_NORMAL) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_NORMAL) ? 
ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_NORMAL) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_NORMAL)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) @@ -9521,20 +9881,18 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( env->me_lazy_fd, F_RDADVISE, &hint); #elif defined(MADV_WILLNEED) - err = madvise(env->me_map + offset, length, MADV_WILLNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = madvise(ptr, length, MADV_WILLNEED) ? ignore_enosys(errno) + : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_WILLNEED) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(_WIN32) || defined(_WIN64) if (mdbx_PrefetchVirtualMemory) { WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map + offset; + hint.VirtualAddress = ptr; hint.NumberOfBytes = length; (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); } @@ -9548,15 +9906,14 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, #endif } } else { + mincore_clean_cache(env); #if defined(MADV_RANDOM) - err = madvise(env->me_map + offset, length, MADV_RANDOM) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_RANDOM) ? 
ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_RANDOM) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_RANDOM) @@ -9630,14 +9987,15 @@ __cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, munlock_begin % env->me_os_psize == 0 && munlock_size % env->me_os_psize == 0); #if defined(_WIN32) || defined(_WIN64) - err = VirtualUnlock(env->me_map + munlock_begin, munlock_size) + err = VirtualUnlock(ptr_disp(env->me_map, munlock_begin), munlock_size) ? MDBX_SUCCESS : (int)GetLastError(); if (err == ERROR_NOT_LOCKED) err = MDBX_SUCCESS; #elif defined(_POSIX_MEMLOCK_RANGE) - err = munlock(env->me_map + munlock_begin, munlock_size) ? errno - : MDBX_SUCCESS; + err = munlock(ptr_disp(env->me_map, munlock_begin), munlock_size) + ? errno + : MDBX_SUCCESS; #endif if (likely(err == MDBX_SUCCESS)) update_mlcnt(env, aligned_pgno, false); @@ -9656,21 +10014,79 @@ __cold static void munlock_all(const MDBX_env *env) { munlock_after(env, 0, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); } -__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, const pgno_t limit_pgno, - const bool implicit) { - const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); - const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); +__cold static unsigned default_rp_augment_limit(const MDBX_env *env) { + /* default rp_augment_limit = ceil(npages / gold_ratio) */ + const size_t augment = (env->me_dbgeo.now >> (env->me_psize2log + 10)) * 633u; + eASSERT(env, augment < MDBX_PGL_LIMIT); + return pnl_bytes2size(pnl_size2bytes( + (augment > MDBX_PNL_INITIAL) ? 
augment : MDBX_PNL_INITIAL)); +} + +static bool default_prefault_write(const MDBX_env *env) { + return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && + (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; +} + +static void adjust_defaults(MDBX_env *env) { + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + if (!env->me_options.flags.non_auto.prefault_write) + env->me_options.prefault_write = default_prefault_write(env); + + const size_t basis = env->me_dbgeo.now; + /* TODO: use options? */ + const unsigned factor = 9; + size_t threshold = (basis < ((size_t)65536 << factor)) + ? 65536 /* minimal threshold */ + : (basis > (MEGABYTE * 4 << factor)) + ? MEGABYTE * 4 /* maximal threshold */ + : basis >> factor; + threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink) + ? threshold + : env->me_dbgeo.shrink; + + env->me_madv_threshold = + bytes2pgno(env, bytes_align2os_bytes(env, threshold)); +} + +enum resize_mode { implicit_grow, impilict_shrink, explicit_resize }; + +__cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, + const pgno_t size_pgno, pgno_t limit_pgno, + const enum resize_mode mode) { + /* Acquire guard to avoid collision between read and write txns + * around me_dbgeo and me_dxb_mmap */ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireExclusive(&env->me_remap_guard); + int rc = MDBX_SUCCESS; + mdbx_handle_array_t *suspended = NULL; + mdbx_handle_array_t array_onstack; +#else + int rc = osal_fastmutex_acquire(&env->me_remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; +#endif + const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_limit = env->me_dxb_mmap.limit; + const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); + eASSERT(env, limit_pgno >= size_pgno); + eASSERT(env, size_pgno >= used_pgno); + if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { + /* The actual mapsize 
may be less since the geo.upper may be changed + * by other process. Avoids remapping until it necessary. */ + limit_pgno = prev_limit_pgno; + } + const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); + const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); #if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) - const void *const prev_addr = env->me_map; + const void *const prev_map = env->me_dxb_mmap.base; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ - VERBOSE("resize datafile/mapping: " + VERBOSE("resize/%d datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR, - prev_size, size_bytes, prev_limit, limit_bytes); + mode, prev_size, size_bytes, prev_limit, limit_bytes); eASSERT(env, limit_bytes >= size_bytes); eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); @@ -9678,20 +10094,18 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, unsigned mresize_flags = env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); -#if defined(_WIN32) || defined(_WIN64) - /* Acquire guard in exclusive mode for: - * - to avoid collision between read and write txns around env->me_dbgeo; - * - to avoid attachment of new reading threads (see osal_rdt_lock); */ - osal_srwlock_AcquireExclusive(&env->me_remap_guard); - mdbx_handle_array_t *suspended = NULL; - mdbx_handle_array_t array_onstack; - int rc = MDBX_SUCCESS; + if (mode >= impilict_shrink) + mresize_flags |= MDBX_SHRINK_ALLOWED; + if (limit_bytes == env->me_dxb_mmap.limit && size_bytes == env->me_dxb_mmap.current && size_bytes == env->me_dxb_mmap.filesize) goto bailout; - if ((env->me_flags & MDBX_NOTLS) == 0) { +#if defined(_WIN32) || defined(_WIN64) + if ((env->me_flags & MDBX_NOTLS) == 0 && + ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->me_dxb_mmap.limit)) { /* 1) Windows allows only extending a read-write section, but not a * corresponding mapped view. 
Therefore in other cases we must suspend * the local threads for safe remap. @@ -9709,64 +10123,61 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } - mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + mresize_flags |= (mode < explicit_resize) + ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; } #else /* Windows */ - /* Acquire guard to avoid collision between read and write txns - * around env->me_dbgeo */ - int rc = osal_fastmutex_acquire(&env->me_remap_guard); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (limit_bytes == env->me_dxb_mmap.limit && - size_bytes == env->me_dxb_mmap.current) - goto bailout; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && - lck && !implicit) { - int err = osal_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; - goto bailout; - } - - /* looking for readers from this process */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - eASSERT(env, !implicit); + if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit && + !(env->me_flags & MDBX_NOTLS)) { mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - for (size_t i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. 
*/ - osal_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; + if (lck) { + int err = osal_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } + + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + osal_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } } } } #endif /* ! Windows */ - if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_NONE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - const pgno_t aligned_munlock_pgno = (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) ? 
0 : bytes2pgno(env, size_bytes); + if (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) { + mincore_clean_cache(env); + if ((env->me_flags & MDBX_WRITEMAP) && + env->me_lck->mti_unsynced_pages.weak) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } munlock_after(env, aligned_munlock_pgno, size_bytes); #if MDBX_ENABLE_MADVISE - if (size_bytes < prev_size) { + if (size_bytes < prev_size && mode > implicit_grow) { NOTICE("resize-MADV_%s %u..%u", (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, bytes2pgno(env, prev_size)); @@ -9775,20 +10186,20 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) - rc = - madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, + MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, MADV_DONTNEED) ? 
ignore_enosys(errno) : MDBX_SUCCESS; #elif defined(POSIX_MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_madvise(env->me_map + size_bytes, + rc = ignore_enosys(posix_madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, POSIX_MADV_DONTNEED)); #elif defined(POSIX_FADV_DONTNEED) @@ -9825,13 +10236,16 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, if (rc == MDBX_SUCCESS) { eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - eASSERT(env, size_bytes == env->me_dxb_mmap.current); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); env->me_lck->mti_discarded_tail.weak = size_pgno; const bool readahead = !(env->me_flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); const bool force = limit_bytes != prev_limit || - env->me_dxb_mmap.address != prev_addr + env->me_dxb_mmap.base != prev_map #if defined(_WIN32) || defined(_WIN64) || prev_size > size_bytes #endif /* Windows */ @@ -9844,9 +10258,16 @@ bailout: if (rc == MDBX_SUCCESS) { eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - eASSERT(env, size_bytes == env->me_dxb_mmap.current); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); + /* update env-geo to avoid influences */ + env->me_dbgeo.now = env->me_dxb_mmap.current; + env->me_dbgeo.upper = env->me_dxb_mmap.limit; + adjust_defaults(env); #ifdef MDBX_USE_VALGRIND - if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { + if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; if (env->me_dxb_mmap.limit) @@ -9866,7 +10287,7 @@ bailout: "limit %" PRIuPTR " -> %" 
PRIuPTR ", errcode %d", prev_size, size_bytes, prev_limit, limit_bytes, rc); } - if (!env->me_dxb_mmap.address) { + if (!env->me_dxb_mmap.base) { env->me_flags |= MDBX_FATAL_ERROR; if (env->me_txn) env->me_txn->mt_flags |= MDBX_TXN_ERROR; @@ -9895,21 +10316,6 @@ bailout: return rc; } -__cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno) { - const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); - eASSERT(env, mapped_pgno >= used_pgno); - return map_resize( - env, used_pgno, size_pgno, - (size_pgno > mapped_pgno) - ? limit_pgno - : /* The actual mapsize may be less since the geo.upper may be changed - by other process. So, avoids remapping until it necessary. */ - mapped_pgno, - true); -} - static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, const pgno_t pgno) { MDBX_meta *const meta = METAPAGE(env, pgno); @@ -9922,7 +10328,7 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, const uint64_t wipe = MDBX_DATASIGN_NONE; const void *ptr = &wipe; size_t bytes = sizeof(meta->mm_sign), - offset = (uint8_t *)&meta->mm_sign - env->me_map; + offset = ptr_dist(&meta->mm_sign, env->me_map); if (env->me_flags & MDBX_WRITEMAP) { unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); @@ -9936,7 +10342,7 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, return err; } ptr = data_page(meta); - offset = (uint8_t *)ptr - env->me_map; + offset = ptr_dist(ptr, env->me_map); bytes = env->me_psize; } @@ -10092,6 +10498,8 @@ MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { } #endif /* _MSC_VER */ +#if !MDBX_PNL_ASCENDING + #if !defined(MDBX_ATTRIBUTE_TARGET) && \ (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) @@ -10358,8 +10766,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const 
size_t len, #ifndef __SANITIZE_ADDRESS__ found: #endif /* __SANITIZE_ADDRESS__ */ - return (pgno_t *)((char *)range - - (__builtin_clzl(mask) >> sizeof(size_t) / 4)); + return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); } range -= 4; } while (range > detent + 3); @@ -10394,7 +10801,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) #define scan4seq_default scan4seq_avx512bw -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) #define scan4seq_default scan4seq_avx2 #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) @@ -10405,28 +10812,30 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, /* Choosing of another variants should be added here. */ #endif /* scan4seq_default */ +#endif /* MDBX_PNL_ASCENDING */ + #ifndef scan4seq_default #define scan4seq_default scan4seq_fallback #endif /* scan4seq_default */ -#ifdef scan4seq -/* The scan4seq() is the best or no alternatives */ +#ifdef scan4seq_impl +/* The scan4seq_impl() is the best or no alternatives */ #elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS -/* The scan4seq_default() will be used since no cpu-features detection support +/* The scan4seq_default() will be used since no cpu-features detection support * from compiler. Please don't ask to implement cpuid-based detection and don't * make such PRs. */ -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #else /* Selecting the most appropriate implementation at runtime, * depending on the available CPU features. 
*/ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq); -static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, - const size_t seq) = scan4seq_resolver; +static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, + const size_t seq) = scan4seq_resolver; static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq) { - pgno_t *(*choice)(pgno_t * range, const size_t len, const size_t seq) = + pgno_t *(*choice)(pgno_t *range, const size_t len, const size_t seq) = nullptr; #if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ __GNUC_PREREQ(4, 8) @@ -10445,10 +10854,10 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, choice = scan4seq_avx512bw; #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ /* Choosing of another variants should be added here. */ - scan4seq = choice ? choice : scan4seq_default; - return scan4seq(range, len, seq); + scan4seq_impl = choice ? choice : scan4seq_default; + return scan4seq_impl(range, len, seq); } -#endif /* scan4seq */ +#endif /* scan4seq_impl */ //------------------------------------------------------------------------------ @@ -10468,50 +10877,386 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, * * Returns 0 on success, non-zero on failure.*/ -#define MDBX_ALLOC_GC 1 -#define MDBX_ALLOC_NEW 2 -#define MDBX_ALLOC_COALESCE 4 -#define MDBX_ALLOC_SLOT 8 -#define MDBX_ALLOC_RESERVE 16 -#define MDBX_ALLOC_BACKLOG 32 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -#define MDBX_ALLOC_LIFO 128 +#define MDBX_ALLOC_DEFAULT 0 +#define MDBX_ALLOC_RESERVE 1 +#define MDBX_ALLOC_UNIMPORTANT 2 +#define MDBX_ALLOC_COALESCE 4 /* внутреннее состояние */ +#define MDBX_ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */ +#define MDBX_ALLOC_LIFO 16 /* внутреннее состояние */ -static __inline bool is_gc_usable(const MDBX_txn *txn) { +static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, + const uint8_t flags) { /* If 
txn is updating the GC, then the retired-list cannot play catch-up with * itself by growing while trying to save it. */ - if (txn->mt_flags & (MDBX_TXN_UPDATE_GC | MDBX_TXN_FROZEN_RE)) + if (mc->mc_dbi == FREE_DBI && !(flags & MDBX_ALLOC_RESERVE) && + !(mc->mc_flags & C_GCU)) return false; - /* avoid (recursive) search inside empty tree and while tree is - updating, https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/31 */ - if (txn->mt_dbs[FREE_DBI].md_entries == 0) - return false; - - /* If our dirty list is already full, we can't touch GC */ - if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) + /* avoid search inside empty tree and while tree is updating, + https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ + if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { + txn->mt_flags |= MDBX_TXN_DRAINED_GC; return false; + } return true; } -static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { - if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - txn->mt_dbs[FREE_DBI].md_flags); - return MDBX_CORRUPTED; - } - return cursor_init(mc, txn, FREE_DBI); +__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); + for (size_t i = 1; i <= len; ++i) + if (txn->tw.lifo_reclaimed[i] == id) + return true; + return false; } -static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, - char flags) { +__hot static pgno_t relist_get_single(MDBX_txn *txn) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + assert(len > 0); + pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist); + const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 1 : -1; + + /* Есть ТРИ потенциально выигрышные, но противо-направленные тактики: + * + * 1. Стараться использовать страницы с наименьшими номерами. 
Так обмен с + * диском будет более кучным, а у страниц ближе к концу БД будет больше шансов + * попасть под авто-компактификацию. Частично эта тактика уже реализована, но + * для её эффективности требуется явно приоритезировать выделение страниц: + * - поддерживать для relist, для ближних и для дальних страниц; + * - использовать страницы из дальнего списка, если первый пуст, + * а второй слишком большой, либо при пустой GC. + * + * 2. Стараться выделять страницы последовательно. Так записываемые на диск + * регионы будут линейными, что принципиально ускоряет запись на HDD. + * Одновременно, в среднем это не повлияет на чтение, точнее говоря, если + * порядок чтения не совпадает с порядком изменения (иначе говоря, если + * чтение не коррклирует с обновлениями и/или вставками) то не повлияет, иначе + * может ускорить. Однако, последовательности в среднем достаточно редки. + * Поэтому для эффективности требуется аккумулировать и поддерживать в ОЗУ + * огромные списки страниц, а затем сохранять их обратно в БД. Текущий формат + * БД (без битовых карт) для этого крайне не удачен. Поэтому эта тактика не + * имеет шансов быть успешной без смены формата БД (Mithril). + * + * 3. Стараться экономить последовательности страниц. Это позволяет избегать + * лишнего чтения/поиска в GC при более-менее постоянном размещении и/или + * обновлении данных требующих более одной страницы. Проблема в том, что без + * информации от приложения библиотека не может знать насколько + * востребованными будут последовательности в ближайшей перспективе, а + * экономия последовательностей "на всякий случай" не только затратна + * сама-по-себе, но и работает во вред. 
+ * + * Поэтому: + * - в TODO добавляется разделение relist на «ближние» и «дальние» страницы, + * с последующей реализацией первой тактики; + * - преимущественное использование последовательностей отправляется + * в MithrilDB как составляющая "HDD frendly" feature; + * - реализованная в 3757eb72f7c6b46862f8f17881ac88e8cecc1979 экономия + * последовательностей отключается через MDBX_ENABLE_SAVING_SEQUENCES=0. + * + * В качестве альтернативы для безусловной «экономии» последовательностей, + * в следующих версиях libmdbx, вероятно, будет предложено + * API для взаимодействия с GC: + * - получение размера GC, включая гистограммы размеров последовательностей + * и близости к концу БД; + * - включение формирования "линейного запаса" для последующего использования + * в рамках текущей транзакции; + * - намеренная загрузка GC в память для коагуляции и "выпрямления"; + * - намеренное копирование данных из страниц в конце БД для последующего + * из освобождения, т.е. контролируемая компактификация по запросу. */ + +#ifndef MDBX_ENABLE_SAVING_SEQUENCES +#define MDBX_ENABLE_SAVING_SEQUENCES 0 +#endif + if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) && + len > 2) { + /* Пытаемся пропускать последовательности при наличии одиночных элементов. + * TODO: необходимо кэшировать пропускаемые последовательности + * чтобы не сканировать список сначала при каждом выделении. 
*/ + pgno_t *scan = target + dir + dir; + size_t left = len; + do { + if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { +#if MDBX_PNL_ASCENDING + target = scan; + break; +#else + /* вырезаем элемент с перемещением хвоста */ + const pgno_t pgno = *scan; + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + while (++scan <= target) + scan[-1] = *scan; + return pgno; +#endif + } + scan += dir; + } while (--left > 2); + } + + const pgno_t pgno = *target; +#if MDBX_PNL_ASCENDING + /* вырезаем элемент с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end; + ++target) + *target = target[1]; +#else + /* перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); +#endif + return pgno; +} + +__hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist); + assert(len >= num && num > 1); + const size_t seq = num - 1; +#if !MDBX_PNL_ASCENDING + if (edge[-(ptrdiff_t)seq] - *edge == seq) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + assert(edge == scan4range_checker(txn->tw.relist, seq)); + /* перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); + return *edge; + } +#endif + pgno_t *target = scan4seq_impl(edge, len, seq); + assert(target == scan4range_checker(txn->tw.relist, seq)); + if (target) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + const pgno_t pgno = *target; + /* вырезаем найденную последовательность с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); +#if MDBX_PNL_ASCENDING + for (const pgno_t *const end = txn->tw.relist + len - num; target <= end; + ++target) + *target = target[num]; +#else + for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;) + target[-(ptrdiff_t)num] = *target; 
+#endif + return pgno; + } + return 0; +} + +#if MDBX_ENABLE_MINCORE +static __inline bool bit_tas(uint64_t *field, char bit) { + const uint64_t m = UINT64_C(1) << bit; + const bool r = (*field & m) != 0; + *field |= m; + return r; +} + +static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) { + MDBX_lockinfo *const lck = env->me_lck; + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i]; + if (likely(dist >= 0 && dist < 64)) { + const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i]; + const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i]; + do { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + } while (--i); + lck->mti_mincore_cache.begin[0] = tmp_begin; + lck->mti_mincore_cache.mask[0] = tmp_mask; + return bit_tas(lck->mti_mincore_cache.mask, (char)dist); + } + } + + size_t pages = 64; + unsigned unit_log = sys_pagesize_ln2; + unsigned shift = 0; + if (env->me_psize > env->me_os_psize) { + unit_log = env->me_psize2log; + shift = env->me_psize2log - sys_pagesize_ln2; + pages <<= shift; + } + + const size_t offset = unit_begin << unit_log; + size_t length = pages << sys_pagesize_ln2; + if (offset + length > env->me_dxb_mmap.current) { + length = env->me_dxb_mmap.current - offset; + pages = length >> sys_pagesize_ln2; + } + +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.mincore.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + uint8_t *const vector = alloca(pages); + if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length, + (void *)vector))) { + NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno); + return false; + } + + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i 
- 1]; + } + lck->mti_mincore_cache.begin[0] = unit_begin; + + uint64_t mask = 0; +#ifdef MINCORE_INCORE + STATIC_ASSERT(MINCORE_INCORE == 1); +#endif + for (size_t i = 0; i < pages; ++i) { + uint64_t bit = (vector[i] & 1) == 0; + bit <<= i >> shift; + mask |= bit; + } + + lck->mti_mincore_cache.mask[0] = ~mask; + return bit_tas(lck->mti_mincore_cache.mask, 0); +} +#endif /* MDBX_ENABLE_MINCORE */ + +MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env, + const pgno_t pgno) { +#if MDBX_ENABLE_MINCORE + const size_t offset_aligned = + floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize); + const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2) + ? env->me_psize2log + : sys_pagesize_ln2; + const size_t unit_begin = offset_aligned >> unit_log2; + eASSERT(env, (unit_begin << unit_log2) == offset_aligned); + const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0]; + if (likely(dist >= 0 && dist < 64)) + return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist); + return mincore_fetch(env, unit_begin); +#else + (void)env; + (void)pgno; + return false; +#endif /* MDBX_ENABLE_MINCORE */ +} + +static __inline pgr_t page_alloc_finalize(MDBX_env *const env, + MDBX_txn *const txn, + const MDBX_cursor *const mc, + const pgno_t pgno, const size_t num) { #if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); size_t majflt_before; const uint64_t cputime_before = osal_cputime(&majflt_before); - uint64_t monotime_shot = 0; + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; +#else + (void)mc; +#endif /* MDBX_ENABLE_PROFGC */ + ENSURE(env, pgno >= NUM_METAS); + + pgr_t ret; + bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0; + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + /* Содержимое выделенной страницы не нужно, но если страница отсутствует + * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет + * к page-fault: + * - прерыванию по отсутствию страницы; + * - переключение контекста в режим ядра с засыпанием процесса; + * - чтение страницы с диска; + * - обновление PTE и пробуждением процесса; + * - переключение контекста по доступности ЦПУ. + * + * Пытаемся минимизировать накладные расходы записывая страницу, что при + * наличии unified page cache приведет к появлению страницы в ОЗУ без чтения + * с диска. При этом запись на диск должна быть отложена адекватным ядром, + * так как страница отображена в память в режиме чтения-записи и следом в + * неё пишет ЦПУ. */ + + /* В случае если страница в памяти процесса, то излишняя запись может быть + * достаточно дорогой. Кроме системного вызова и копирования данных, в особо + * одаренных ОС при этом могут включаться файловая система, выделяться + * временная страница, пополняться очереди асинхронного выполнения, + * обновляться PTE с последующей генерацией page-fault и чтением данных из + * грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть + * сравним с избегаемым ненужным чтением. */ + if (env->me_prefault_write) { + void *const pattern = ptr_disp( + env->me_pbuf, need_clean ? 
env->me_psize : env->me_psize * 2); + size_t file_offset = pgno2bytes(env, pgno); + if (likely(num == 1)) { + if (!mincore_probe(env, pgno)) { + osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + need_clean = false; + } + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + size_t n = 0, cleared = 0; + for (size_t i = 0; i < num; ++i) { + if (!mincore_probe(env, pgno + (pgno_t)i)) { + ++cleared; + iov[n].iov_len = env->me_psize; + iov[n].iov_base = pattern; + if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, + file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + n = 0; + } + } + } + if (likely(n > 0)) { + osal_pwritev(env->me_lazy_fd, iov, n, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + if (cleared == num) + need_clean = false; + } + } + } else { + ret.page = page_malloc(txn, num); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto bailout; + } + } + + if (unlikely(need_clean)) + memset(ret.page, -1, pgno2bytes(env, num)); + + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->mp_pages = (pgno_t)num; + ret.page->mp_flags = P_OVERFLOW; + } + + ret.err = page_dirty(txn, ret.page, (pgno_t)num); +bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; +#endif /* MDBX_ENABLE_PROFGC */ + return 
ret; +} + +static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, + uint8_t flags) { +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); #endif /* MDBX_ENABLE_PROFGC */ pgr_t ret; @@ -10524,377 +11269,412 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, prof->spe_counter += 1; #endif /* MDBX_ENABLE_PROFGC */ - eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); - eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); - eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | - MDBX_ALLOC_BACKLOG)) == 0 || - (flags & MDBX_ALLOC_GC)); - eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | - MDBX_ALLOC_BACKLOG)) == 0 || - (flags & MDBX_ALLOC_NEW) == 0); + eASSERT(env, num > 0 || (flags & MDBX_ALLOC_RESERVE)); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno = 0, *range = nullptr; - size_t re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t pgno = 0; + size_t newnext; if (num > 1) { - eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); #if MDBX_ENABLE_PROFGC prof->xpages += 1; #endif /* MDBX_ENABLE_PROFGC */ - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) goto done; - } } } else { - eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || - MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE) || num == 0); } //--------------------------------------------------------------------------- - if (likely(flags & MDBX_ALLOC_GC)) { - if (unlikely(!is_gc_usable(txn))) - goto no_gc; + if (unlikely(!is_gc_usable(txn, mc, flags))) { + eASSERT(env, txn->mt_flags & MDBX_TXN_DRAINED_GC); + goto no_gc; + } - eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); - flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0; + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | + MDBX_ALLOC_SHOULD_SCAN)) == 0); + flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0; - const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; + if (/* Не коагулируем записи при подготовке резерва для обновления GC. + * Иначе попытка увеличить резерв может приводить к необходимости ещё + * большего резерва из-за увеличения списка переработанных страниц. 
*/ + (flags & MDBX_ALLOC_RESERVE) == 0) { if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) + MDBX_PNL_GETSIZE(txn->tw.relist) < env->me_maxgc_ov1page / 2) flags += MDBX_ALLOC_COALESCE; + } - MDBX_cursor recur; - ret.err = gc_cursor_init(&recur, txn); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; + MDBX_cursor *const gc = ptr_disp(env->me_txn0, sizeof(MDBX_txn)); + eASSERT(env, mc != gc && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; - retry_gc_refresh_oldest:; - txnid_t oldest = txn_oldest_reader(txn); - if (unlikely(!oldest)) - goto no_gc; + env->me_prefault_write = env->me_options.prefault_write; + if (env->me_prefault_write) { + /* Проверка посредством minicore() существенно снижает затраты, но в + * простейших случаях (тривиальный бенчмарк) интегральная производительность + * становится вдвое меньше. А на платформах без mincore() и с проблемной + * подсистемой виртуальной памяти ситуация может быть многократно хуже. + * Поэтому избегаем затрат в ситуациях когда prefaukt-write скорее всего не + * нужна. 
*/ + const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; + const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; + if (/* Не суетимся если GC почти пустая и БД маленькая */ + (txn->mt_dbs[FREE_DBI].md_branch_pages == 0 && + txn->mt_geo.now < 1234) || + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + (readahead_enabled && pgno + num < readahead_edge)) + env->me_prefault_write = false; + } - retry_gc_have_oldest: - if (unlikely(oldest >= txn->mt_txnid)) { - ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN - " for current-txnid %" PRIaTXN, - oldest, txn->mt_txnid); - ret.err = MDBX_PROBLEM; - goto fail; - } - const txnid_t detent = oldest + 1; +retry_gc_refresh_oldest:; + txnid_t oldest = txn_oldest_reader(txn); +retry_gc_have_oldest: + if (unlikely(oldest >= txn->mt_txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->mt_txnid); + ret.err = MDBX_PROBLEM; + goto fail; + } + const txnid_t detent = oldest + 1; - txnid_t last = 0; - bool should_scan = false; - MDBX_cursor_op op = MDBX_FIRST; - if (flags & MDBX_ALLOC_LIFO) { - if (!txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = txl_alloc(); - if (unlikely(!txn->tw.lifo_reclaimed)) { - ret.err = MDBX_ENOMEM; - goto fail; - } + txnid_t id = 0; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & MDBX_ALLOC_LIFO) { + if (!txn->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { + ret.err = MDBX_ENOMEM; + goto fail; } - /* Begin lookup backward from oldest reader */ - last = detent - 1; - op = MDBX_SET_RANGE; - } else if (txn->tw.last_reclaimed) { - /* Continue lookup forward from last-reclaimed */ - last = txn->tw.last_reclaimed + 1; - if (last >= detent) - goto no_gc; - op = MDBX_SET_RANGE; } + /* Begin lookup backward from oldest reader */ + id = detent - 1; + op = MDBX_SET_RANGE; + } else if (txn->tw.last_reclaimed) { + /* Continue lookup 
forward from last-reclaimed */ + id = txn->tw.last_reclaimed + 1; + if (id >= detent) + goto depleted_gc; + op = MDBX_SET_RANGE; + } - next_gc:; - MDBX_val key; - key.iov_base = &last; - key.iov_len = sizeof(last); +next_gc:; + MDBX_val key; + key.iov_base = &id; + key.iov_len = sizeof(id); #if MDBX_ENABLE_PROFGC - prof->rsteps += 1; + prof->rsteps += 1; #endif /* MDBX_ENABLE_PROFGC */ - /* Seek first/next GC record */ - ret.err = mdbx_cursor_get(&recur, &key, NULL, op); - if (unlikely(ret.err != MDBX_SUCCESS)) { - if (unlikely(ret.err != MDBX_NOTFOUND)) - goto fail; - if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { - op = MDBX_PREV; - goto next_gc; - } - goto depleted_gc; - } - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ret.err = MDBX_CORRUPTED; + /* Seek first/next GC record */ + ret.err = cursor_get(gc, &key, NULL, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; - } - last = unaligned_peek_u64(4, key.iov_base); - if (flags & MDBX_ALLOC_LIFO) { + if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { op = MDBX_PREV; - if (last >= detent) - goto next_gc; - /* skip IDs of records that already reclaimed */ - for (size_t i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) - if (txn->tw.lifo_reclaimed[i] == last) - goto next_gc; - } else { - op = MDBX_NEXT; - if (unlikely(last >= detent)) - goto depleted_gc; + goto next_gc; } + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + id = unaligned_peek_u64(4, key.iov_base); + if (flags & MDBX_ALLOC_LIFO) { + op = MDBX_PREV; + if (id >= detent || is_already_reclaimed(txn, id)) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(id >= detent)) + goto depleted_gc; + } + txn->mt_flags &= ~MDBX_TXN_DRAINED_GC; - /* Reading next GC record */ - MDBX_val data; - MDBX_page *const mp = recur.mc_pg[recur.mc_top]; - if (unlikely((ret.err = node_read(&recur, - page_node(mp, 
recur.mc_ki[recur.mc_top]), - &data, mp)) != MDBX_SUCCESS)) - goto fail; + /* Reading next GC record */ + MDBX_val data; + MDBX_page *const mp = gc->mc_pg[gc->mc_top]; + if (unlikely((ret.err = node_read(gc, page_node(mp, gc->mc_ki[gc->mc_top]), + &data, mp)) != MDBX_SUCCESS)) + goto fail; - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !pnl_check(gc_pnl, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; - goto fail; + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + + if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= + env->me_maxgc_ov1page)) { + /* Don't try to coalesce too much. 
*/ + if (flags & MDBX_ALLOC_SHOULD_SCAN) { + eASSERT(env, flags & MDBX_ALLOC_COALESCE); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + eASSERT(env, num > 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + if (likely(num == 1)) { + pgno = relist_get_single(txn); + goto done; + } + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; } - const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( txn->tw.relist) >= env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && + ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + num) || gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. - * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", + /* Stop reclaiming to avoid large/overflow the page list. This is a rare + * case while search for a continuously multi-page region in a + * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ + NOTICE("stop reclaiming %s: %zu (current) + %zu " + "(chunk) -> %zu, rp_augment_limit %u", + likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < MDBX_PGL_LIMIT) + ? 
"since rp_augment_limit was reached" + : "to avoid PNL overflow", MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist), + env->me_options.rp_augment_limit); goto depleted_gc; } + } - /* Remember ID of readed GC record */ - txn->tw.last_reclaimed = last; - if (flags & MDBX_ALLOC_LIFO) { - ret.err = txl_append(&txn->tw.lifo_reclaimed, last); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - } - - /* Append PNL from GC record to tw.relist */ - ret.err = pnl_need(&txn->tw.relist, gc_len); + /* Remember ID of readed GC record */ + txn->tw.last_reclaimed = id; + if (flags & MDBX_ALLOC_LIFO) { + ret.err = txl_append(&txn->tw.lifo_reclaimed, id); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; - txn->tw.relist = txn->tw.relist; + } - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO - " len %zu, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (size_t i = gc_len; i; i--) - DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); - DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); + /* Append PNL from GC record to tw.relist */ + ret.err = pnl_need(&txn->tw.relist, gc_len); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO + " len %zu, PNL", + id, txn->mt_dbs[FREE_DBI].md_root, gc_len); + for (size_t i = gc_len; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); + } + + /* Merge in descending sorted order */ + pnl_merge(txn->tw.relist, gc_pnl); + flags |= MDBX_ALLOC_SHOULD_SCAN; + if (AUDIT_ENABLED()) { + if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; } + } else { + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); + } + eASSERT(env, dirtylist_check(txn)); - /* Merge in descending sorted order 
*/ - re_len = pnl_merge(txn->tw.relist, gc_pnl); - should_scan = true; - if (AUDIT_ENABLED()) { - if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - } else { - eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); - } - eASSERT(env, dirtylist_check(txn)); + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { + /* Refund suitable pages into "unallocated" space */ + txn_refund(txn); + } + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - eASSERT(env, - re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && - unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { - /* Refund suitable pages into "unallocated" space */ - if (txn_refund(txn)) - re_len = MDBX_PNL_GETSIZE(txn->tw.relist); - } - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(num == 0)) { + eASSERT(env, ret.err == MDBX_SUCCESS); + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto early_exit; + } - /* Done for a kick-reclaim mode, actually no page needed */ - if (unlikely(flags & MDBX_ALLOC_SLOT)) { - eASSERT(env, ret.err == MDBX_SUCCESS); - goto early_exit; - } + /* TODO: delete reclaimed records */ - /* TODO: delete reclaimed records */ + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & MDBX_ALLOC_COALESCE) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; + } - /* Don't try to coalesce too much. 
*/ - eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); - if (flags & MDBX_ALLOC_COALESCE) { - if (re_len /* current size */ < coalesce_threshold) { -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; -#endif /* MDBX_ENABLE_PROFGC */ - goto next_gc; - } - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; - } - - scan: - eASSERT(env, should_scan); - if (re_len >= num) { - eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && - MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; - goto done; - } - } - should_scan = false; - if (ret.err == MDBX_SUCCESS) - goto next_gc; - - depleted_gc: - ret.err = MDBX_NOTFOUND; - if (should_scan) - goto scan; - - //------------------------------------------------------------------------- - - /* There is no suitable pages in the GC and to be able to allocate - * we should CHOICE one of: - * - make a new steady checkpoint if reclaiming was stopped by - * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; - * - kick lagging reader(s) if reclaiming was stopped by ones of it. - * - extend the database file. */ - - /* Will use new pages from the map if nothing is suitable in the GC. */ - pgno = txn->mt_next_pgno; - const size_t newnext = num + pgno; - - const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); - /* does reclaiming stopped at the last steady point? 
*/ - if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && - detent == prefer_steady.txnid + 1) { - DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN - "-%s, detent %" PRIaTXN, - recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, - durable_caption(prefer_steady.ptr_c), detent); - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - /* wipe the last steady-point if one of: - * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted - * otherwise, make a new steady-point if one of: - * - auto-sync threshold is specified and reached; - * - upper limit of database size is reached; - * - database is full (with the current file size) - * AND auto-sync threshold it NOT specified */ - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - ((autosync_threshold | autosync_period) == 0 || - newnext >= prefer_steady.ptr_c->mm_geo.now)) { - /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode - * without any auto-sync threshold(s). 
*/ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - ret.err = wipe_steady(txn, detent); - DEBUG("gc-wipe-steady, rc %d", ret.err); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; - } - if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || - (autosync_threshold && - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= - autosync_threshold) || - (autosync_period && - (eoos_timestamp = - atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period) || - newnext >= txn->mt_geo.upper || - (newnext >= txn->mt_end_pgno && - (autosync_threshold | autosync_period) == 0)) { - /* make steady checkpoint. */ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - MDBX_meta meta = *recent.ptr_c; - ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, - &txn->tw.troika); - DEBUG("gc-make-steady, rc %d", ret.err); - eASSERT(env, ret.err != MDBX_RESULT_TRUE); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; - } - } - - if (env->me_lck_mmap.lck && - unlikely(true == - atomic_load32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, - mo_AcquireRelease))) { - oldest = txn_oldest_reader(txn); - if (oldest >= detent) - goto retry_gc_have_oldest; - } - - /* avoid kick lagging reader(s) if is enough unallocated space - * at the end of database file. 
*/ - if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { - eASSERT(env, range == nullptr); +scan: + eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); + eASSERT(env, num > 0); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + if (likely(num == 1)) { + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + pgno = relist_get_single(txn); goto done; } + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + flags -= MDBX_ALLOC_SHOULD_SCAN; + if (ret.err == MDBX_SUCCESS) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; + } - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { - oldest = kick_longlived_readers(env, oldest); - if (oldest >= detent) - goto retry_gc_have_oldest; +depleted_gc: + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + ret.err = MDBX_NOTFOUND; + if (flags & MDBX_ALLOC_SHOULD_SCAN) + goto scan; + txn->mt_flags |= MDBX_TXN_DRAINED_GC; + + //------------------------------------------------------------------------- + + /* There is no suitable pages in the GC and to be able to allocate + * we should CHOICE one of: + * - make a new steady checkpoint if reclaiming was stopped by + * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; + * - kick lagging reader(s) if reclaiming was stopped by ones of it. + * - extend the database file. */ + + /* Will use new pages from the map if nothing is suitable in the GC. */ + newnext = txn->mt_next_pgno + num; + + /* Does reclaiming stopped at the last steady point? 
*/ + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + /* wipe the last steady-point if one of: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: + * - auto-sync threshold is specified and reached; + * - upper limit of database size is reached; + * - database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && + ((autosync_threshold | autosync_period) == 0 || + newnext >= prefer_steady.ptr_c->mm_geo.now)) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync threshold(s). 
*/ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + ret.err = wipe_steady(txn, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; } + if ((autosync_threshold && + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || + newnext >= txn->mt_geo.upper || + ((num == 0 || newnext >= txn->mt_end_pgno) && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, ret.err != MDBX_RESULT_TRUE); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + } + + if (unlikely(true == atomic_load32(&env->me_lck->mti_readers_refresh_flag, + mo_AcquireRelease))) { + oldest = txn_oldest_reader(txn); + if (oldest >= detent) + goto retry_gc_have_oldest; + } + + /* Avoid kick lagging reader(s) if is enough unallocated space + * at the end of database file. 
*/ + if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { + eASSERT(env, pgno == 0); + goto done; + } + + if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { + oldest = kick_longlived_readers(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; } //--------------------------------------------------------------------------- no_gc: - if ((flags & MDBX_ALLOC_NEW) == 0) { + eASSERT(env, pgno == 0); +#ifndef MDBX_ENABLE_BACKLOG_DEPLETED +#define MDBX_ENABLE_BACKLOG_DEPLETED 0 +#endif /* MDBX_ENABLE_BACKLOG_DEPLETED*/ + if (MDBX_ENABLE_BACKLOG_DEPLETED && + unlikely(!(txn->mt_flags & MDBX_TXN_DRAINED_GC))) { + ret.err = MDBX_BACKLOG_DEPLETED; + goto fail; + } + if (flags & MDBX_ALLOC_RESERVE) { ret.err = MDBX_NOTFOUND; goto fail; } /* Will use new pages from the map if nothing is suitable in the GC. */ - pgno = txn->mt_next_pgno; - const size_t newnext = num + pgno; + newnext = txn->mt_next_pgno + num; if (newnext <= txn->mt_end_pgno) goto done; @@ -10913,79 +11693,35 @@ no_gc: aligned = txn->mt_geo.upper; eASSERT(env, aligned >= newnext); -#if MDBX_ENABLE_PROFGC - monotime_shot = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, aligned - txn->mt_end_pgno); - ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, - txn->mt_geo.upper); + ret.err = dxb_resize(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper, implicit_grow); if (ret.err != MDBX_SUCCESS) { ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, aligned - txn->mt_end_pgno, ret.err); goto fail; } env->me_txn->mt_end_pgno = (pgno_t)aligned; + eASSERT(env, pgno == 0); //--------------------------------------------------------------------------- done: ret.err = MDBX_SUCCESS; - if (likely((flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) == 0)) { - ENSURE(env, pgno >= NUM_METAS); - if (range) { - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); - eASSERT(env, pgno == *range); + 
if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { + if (pgno) { eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); - /* Cutoff allocated pages from tw.relist */ -#if MDBX_PNL_ASCENDING - for (const pgno_t *const end = re_list + re_len - num; range <= end; - ++range) - *range = range[num]; -#else - for (const pgno_t *const end = txn->tw.relist + re_len; ++range <= end;) - range[-(ptrdiff_t)num] = *range; -#endif - MDBX_PNL_SETSIZE(txn->tw.relist, re_len -= num); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { - eASSERT(env, flags & MDBX_ALLOC_NEW); pgno = txn->mt_next_pgno; txn->mt_next_pgno += (pgno_t)num; eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); } -#if MDBX_ENABLE_PROFGC - if (!monotime_shot) - monotime_shot = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); - } else { - ret.page = page_malloc(txn, num); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = (pgno_t)num; - ret.page->mp_flags = P_OVERFLOW; - } - - ret.err = page_dirty(txn, ret.page, (pgno_t)num); + ret = page_alloc_finalize(env, txn, mc, pgno, num); if (unlikely(ret.err != MDBX_SUCCESS)) { fail: eASSERT(env, ret.err != MDBX_SUCCESS); @@ -10994,8 +11730,9 @@ done: int level; const char *what; if (flags & MDBX_ALLOC_RESERVE) { - level = (flags & MDBX_ALLOC_BACKLOG) ? 
MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; + level = + (flags & MDBX_ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = num ? "reserve-pages" : "fetch-slot"; } else { txn->mt_flags |= MDBX_TXN_ERROR; level = MDBX_LOG_ERROR; @@ -11003,35 +11740,35 @@ done: } if (LOG_ENABLED(level)) debug_log(level, __func__, __LINE__, - "unable alloc %zu %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); + "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " + "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " + "branch %zu, leaf %zu, large %zu, entries %zu\n", + num, what, flags, ret.err, txn->mt_flags, + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, + txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); ret.page = NULL; } } else { early_exit: DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, - (flags & MDBX_ALLOC_SLOT) ? "SLOT" : "RESERVE", ret.err); + num ? 
"RESERVE" : "SLOT", ret.err); ret.page = NULL; } - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); #if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += majflt_after - majflt_before; - const uint64_t monotime_now = osal_monotime(); - if (monotime_shot) { - prof->xtime_monotonic += monotime_shot - monotime_before; - prof->rtime_monotonic += monotime_now - monotime_shot; - } else - prof->rtime_monotonic += monotime_now - monotime_before; + prof->rtime_monotonic += osal_monotime() - monotime_before; #endif /* MDBX_ENABLE_PROFGC */ return ret; } -__hot static pgr_t page_alloc(const MDBX_cursor *mc) { +__hot static pgr_t page_alloc(const MDBX_cursor *const mc) { MDBX_txn *const txn = mc->mc_txn; + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(txn->mt_dbistate[mc->mc_dbi], DBI_DIRTY | DBI_VALID)); /* If there are any loose pages, just use them */ while (likely(txn->tw.loose_pages)) { @@ -11043,97 +11780,67 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { } #endif /* MDBX_ENABLE_REFUND */ - MDBX_page *page = txn->tw.loose_pages; - txn->tw.loose_pages = page->mp_next; + MDBX_page *lp = txn->tw.loose_pages; + MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->mt_env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; - DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno); - tASSERT(txn, page->mp_pgno < txn->mt_next_pgno); - tASSERT(txn, page->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); - page->mp_txnid = txn->mt_front; - pgr_t ret = {page, MDBX_SUCCESS}; + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), lp->mp_pgno); + tASSERT(txn, lp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, 
lp->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->mt_env)); + lp->mp_txnid = txn->mt_front; + pgr_t ret = {lp, MDBX_SUCCESS}; return ret; } - if (likely(!(txn->mt_flags & MDBX_TXN_FROZEN_RE))) { - MDBX_PNL pnl = txn->tw.relist; - const size_t len = MDBX_PNL_GETSIZE(pnl); - if (likely(len > 0)) { - MDBX_env *const env = txn->mt_env; + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) + return page_alloc_finalize(txn->mt_env, txn, mc, relist_get_single(txn), 1); - MDBX_PNL_SETSIZE(pnl, len - 1); -#if MDBX_PNL_ASCENDING - const pgno_t pgno = pnl[1]; - for (size_t i = 1; i < len; ++i) - pnl[i] = pnl[i + 1]; -#else - const pgno_t pgno = pnl[len]; -#endif - -#if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); - size_t majflt_before; - const uint64_t cputime_before = osal_cputime(&majflt_before); - profgc_stat_t *const prof = - (mc->mc_dbi == FREE_DBI) ? &env->me_lck->mti_pgop_stat.gc_prof.self - : &env->me_lck->mti_pgop_stat.gc_prof.work; -#endif /* MDBX_ENABLE_PROFGC */ - pgr_t ret; - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); - } else { - ret.page = page_malloc(txn, 1); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto bailout; - } - } - - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); - - ret.err = page_dirty(txn, ret.page, 1); - bailout: - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); -#if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += majflt_after - majflt_before; - prof->xtime_monotonic += osal_monotime() - monotime_before; -#endif /* MDBX_ENABLE_PROFGC */ - return ret; - } - } - - return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); + return 
page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); } -/* Copy the used portions of a non-large/overflow page. */ -__hot static void page_copy(MDBX_page *dst, const MDBX_page *src, - size_t psize) { +/* Copy the used portions of a page. */ +__hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, + const size_t size) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + void *copy_dst = dst; + const void *copy_src = src; + size_t copy_len = size; + if (src->mp_flags & P_LEAF2) { + copy_len = PAGEHDRSZ + src->mp_leaf2_ksize * page_numkeys(src); + if (unlikely(copy_len > size)) + goto bailout; + } if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { - size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; - + size_t upper = src->mp_upper, lower = src->mp_lower; + intptr_t unused = upper - lower; /* If page isn't full, just copy the used portion. Adjust * alignment so memcpy may copy words instead of bytes. 
*/ - if (unused >= MDBX_CACHELINE_SIZE * 2) { + if (unused > MDBX_CACHELINE_SIZE * 3) { lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); - memcpy(dst, src, lower); - dst = (void *)((char *)dst + upper); - src = (void *)((char *)src + upper); - psize -= upper; + if (unlikely(upper > copy_len)) + goto bailout; + memcpy(copy_dst, copy_src, lower); + copy_dst = ptr_disp(copy_dst, upper); + copy_src = ptr_disp(copy_src, upper); + copy_len -= upper; } } - memcpy(dst, src, psize); + memcpy(copy_dst, copy_src, copy_len); + return; + +bailout: + if (src->mp_flags & P_LEAF2) + bad_page(src, "%s addr %p, n-keys %zu, ksize %u", + "invalid/corrupted source page", __Wpedantic_format_voidptr(src), + page_numkeys(src), src->mp_leaf2_ksize); + else + bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", + __Wpedantic_format_voidptr(src), src->mp_upper); + memset(dst, -1, size); } /* Pull a page off the txn's spill list, if present. 
@@ -11200,6 +11907,9 @@ __hot static int page_touch(MDBX_cursor *mc) { MDBX_txn *txn = mc->mc_txn; int rc; + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(*mc->mc_dbistate, DBI_DIRTY | DBI_VALID)); + tASSERT(txn, !IS_OVERFLOW(mp)); if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); @@ -11207,16 +11917,46 @@ __hot static int page_touch(MDBX_cursor *mc) { tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); - } else { - tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY); } - tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, !IS_OVERFLOW(mp)); tASSERT(txn, dirtylist_check(txn)); } - if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) + if (IS_MODIFIABLE(txn, mp)) { + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; + } + if (IS_SUBP(mp)) + return MDBX_SUCCESS; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const size_t n = dpl_search(txn, mp->mp_pgno); + if (MDBX_AVOID_MSYNC && + unlikely(txn->tw.dirtylist->items[n].pgno != mp->mp_pgno)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + np = (MDBX_page *)mp; +#if MDBX_ENABLE_PGOP_STAT + txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + return page_dirty(txn, np, 1); + } + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); + tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && + txn->tw.dirtylist->items[n].ptr == mp); + if (!MDBX_AVOID_MSYNC || (txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = + ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } return MDBX_SUCCESS; + } + 
if (IS_SUBP(mp)) { + np = (MDBX_page *)mp; + np->mp_txnid = txn->mt_front; + return MDBX_SUCCESS; + } + tASSERT(txn, !IS_OVERFLOW(mp)); if (IS_FROZEN(txn, mp)) { /* CoW the page */ @@ -11344,7 +12084,7 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_page *page = data_page(head.ptr_c); rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); + ptr_dist(page, env->me_map)); if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -11398,6 +12138,15 @@ retry:; goto bailout; } + if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && + unlikely(head.ptr_c->mm_geo.next > + bytes2pgno(env, env->me_dxb_mmap.current))) { + rc = dxb_resize(env, head.ptr_c->mm_geo.next, head.ptr_c->mm_geo.now, + head.ptr_c->mm_geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + const size_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = @@ -11530,17 +12279,11 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return env_sync(env, force, nonblock); } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); } - -__cold int mdbx_env_sync_poll(MDBX_env *env) { - return __inline_mdbx_env_sync_poll(env); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - /* Back up parent txn's cursors, then grab the originals for tracking */ static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { - for (int i = parent->mt_numdbs; --i >= 0;) { + tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); + nested->mt_cursors[FREE_DBI] = nullptr; + for (int i = parent->mt_numdbs; --i > FREE_DBI;) { nested->mt_cursors[i] = NULL; MDBX_cursor *mc = parent->mt_cursors[i]; if (mc != NULL) { @@ -11585,23 +12328,26 @@ static int 
cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * * Returns 0 on success, non-zero on failure. */ static void cursors_eot(MDBX_txn *txn, const bool merge) { - for (intptr_t i = txn->mt_numdbs; --i >= 0;) { - MDBX_cursor *next, *mc = txn->mt_cursors[i]; + tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); + for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { + MDBX_cursor *mc = txn->mt_cursors[i]; if (!mc) continue; - txn->mt_cursors[i] = NULL; + txn->mt_cursors[i] = nullptr; do { const unsigned stage = mc->mc_signature; - MDBX_cursor *bk = mc->mc_backup; - next = mc->mc_next; + MDBX_cursor *const next = mc->mc_next; + MDBX_cursor *const bk = mc->mc_backup; ENSURE(txn->mt_env, stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; - cASSERT(mc, mx == bk->mc_xcursor); tASSERT(txn, txn->mt_parent != NULL); + /* Zap: Using uninitialized memory '*mc->mc_backup'. */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, mx == bk->mc_xcursor); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { @@ -11631,7 +12377,8 @@ static void cursors_eot(MDBX_txn *txn, const bool merge) { mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } - } while ((mc = next) != NULL); + mc = next; + } while (mc); } } @@ -11703,10 +12450,11 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { if (edge > last) { eASSERT(env, last >= NUM_METAS); env->me_poison_edge = last; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, pgno2bytes(env, last)), pgno2bytes(env, edge - last)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last), - pgno2bytes(env, edge - last)); + MDBX_ASAN_POISON_MEMORY_REGION( + 
ptr_disp(env->me_map, pgno2bytes(env, last)), + pgno2bytes(env, edge - last)); } if (should_unlock) mdbx_txn_unlock(env); @@ -11855,7 +12603,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_SUCCESS; } -/* check against https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ +/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const volatile MDBX_db *dbs, const volatile MDBX_meta *meta, bool report) { @@ -11879,20 +12627,26 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN - " for meta_txnid %" PRIaTXN " %s", - "free", freedb_mod_txnid, txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING( + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + " %s", + "free", freedb_mod_txnid, txnid, + (env->me_stuck_meta < 0) + ? "(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); ok = false; } if (unlikely(txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN - " for meta_txnid %" PRIaTXN " %s", - "main", maindb_mod_txnid, txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING( + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + " %s", + "main", maindb_mod_txnid, txnid, + (env->me_stuck_meta < 0) + ? 
"(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); ok = false; } if (likely(freedb_root && freedb_mod_txnid)) { @@ -11902,11 +12656,12 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const txnid_t root_txnid = freedb_root->mp_txnid; if (unlikely(root_txnid != freedb_mod_txnid)) { if (report) - WARNING( - "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN - " for %sdb.mod_txnid %" PRIaTXN " %s", - freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, + (env->me_stuck_meta < 0) ? "(workaround for incoherent flaw of " + "unified page/buffer cache)" + : "(wagering meta)"); ok = false; } } @@ -11917,26 +12672,33 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const txnid_t root_txnid = maindb_root->mp_txnid; if (unlikely(root_txnid != maindb_mod_txnid)) { if (report) - WARNING( - "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN - " for %sdb.mod_txnid %" PRIaTXN " %s", - maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, + (env->me_stuck_meta < 0) ? "(workaround for incoherent flaw of " + "unified page/buffer cache)" + : "(wagering meta)"); ok = false; } } + if (unlikely(!ok) && report) + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? 
INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; return ok; } -__cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { +__cold static int coherency_timeout(uint64_t *timestamp, intptr_t pgno, + const MDBX_env *env) { if (likely(timestamp && *timestamp == 0)) *timestamp = osal_monotime(); else if (unlikely(!timestamp || osal_monotime() - *timestamp > osal_16dot16_to_monotime(65536 / 10))) { - if (pgno) - ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno, + if (pgno >= 0 && pgno != env->me_stuck_meta) + ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); - else + else if (env->me_stuck_meta < 0) ERROR("bailout waiting for valid snapshot (%s)", "workaround for incoherent flaw of unified page/buffer cache"); return MDBX_PROBLEM; @@ -11956,39 +12718,48 @@ __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { } /* check with timeout as the workaround - * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ -__hot static int coherency_check_readed(const MDBX_env *env, - const txnid_t txnid, - const volatile MDBX_db *dbs, - const volatile MDBX_meta *meta, - uint64_t *timestamp) { - const bool report = !(timestamp && *timestamp); - if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) - return coherency_timeout(timestamp, 0); + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ +__hot static int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, + uint64_t *timestamp) { + /* Copy the DB info and flags */ + txn->mt_geo = head.ptr_v->mm_geo; + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = head.ptr_v->mm_canary; + + if (unlikely(!coherency_check(txn->mt_env, head.txnid, txn->mt_dbs, + head.ptr_v, *timestamp == 0))) + return coherency_timeout(timestamp, -1, txn->mt_env); return MDBX_SUCCESS; } static int coherency_check_written(const MDBX_env 
*env, const txnid_t txnid, const volatile MDBX_meta *meta, - uint64_t *timestamp) { + const intptr_t pgno, uint64_t *timestamp) { const bool report = !(timestamp && *timestamp); const txnid_t head_txnid = meta_txnid(meta); - if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) { - if (report) + if (unlikely(head_txnid < MIN_TXNID || head_txnid < txnid)) { + if (report) { + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid, - bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb), + bytes2pgno(env, ptr_dist(meta, env->me_map)), "(workaround for incoherent flaw of unified page/buffer cache)"); - return coherency_timeout(timestamp, 0); + } + return coherency_timeout(timestamp, pgno, env); } - return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); + if (unlikely(!coherency_check(env, head_txnid, meta->mm_dbs, meta, report))) + return coherency_timeout(timestamp, pgno, env); + return MDBX_SUCCESS; } -static bool coherency_check_meta(const MDBX_env *env, +static bool check_meta_coherency(const MDBX_env *env, const volatile MDBX_meta *meta, bool report) { uint64_t timestamp = 0; - return coherency_check_written(env, 0, meta, report ? ×tamp : nullptr) == - MDBX_SUCCESS; + return coherency_check_written(env, 0, meta, -1, + report ? ×tamp : nullptr) == MDBX_SUCCESS; } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ @@ -12100,13 +12871,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Snap the state from current meta-head */ txn->mt_txnid = head.txnid; - txn->mt_geo = head.ptr_v->mm_geo; - memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - txn->mt_canary = head.ptr_v->mm_canary; - - if (unlikely(env->me_stuck_meta >= 0)) - break; - if (unlikely(meta_should_retry(env, &troika) || + if (likely(env->me_stuck_meta < 0) && + unlikely(meta_should_retry(env, &troika) || head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease))) { if (unlikely(++loop > 42)) { @@ -12122,8 +12888,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { continue; } - rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v, - ×tamp); + rc = coherency_check_head(txn, head, ×tamp); jitter4testing(false); if (likely(rc == MDBX_SUCCESS)) break; @@ -12192,16 +12957,13 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->tw.troika = meta_tap(env); const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; - while ( - "workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { - rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, - head.ptr_v, ×tamp); + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_head(txn, head, ×tamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - txn->mt_canary = head.ptr_c->mm_canary; eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); txn->mt_txnid = safe64_txnid_next(head.txnid); if (unlikely(txn->mt_txnid > MAX_TXNID)) { @@ -12218,31 +12980,29 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); - txn->tw.spill_pages = NULL; - txn->tw.spill_least_removed = 0; + txn->tw.spilled.list = NULL; + 
txn->tw.spilled.least_removed = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); env->me_txn = txn; txn->mt_numdbs = env->me_numdbs; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - /* Moved to here to avoid a data race in read TXNs */ - txn->mt_geo = head.ptr_c->mm_geo; if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { rc = dpl_alloc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; + txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0; } else { tASSERT(txn, txn->tw.dirtylist == nullptr); txn->tw.dirtylist = nullptr; txn->tw.dirtyroom = MAX_PAGENO; txn->tw.dirtylru = 0; } + eASSERT(env, txn->tw.writemap_dirty_npages == 0); + eASSERT(env, txn->tw.writemap_spilled_npages == 0); } /* Setup db info */ @@ -12255,6 +13015,10 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { (db_flags & DB_VALID) ? DBI_VALID | DBI_USRVALID | DBI_STALE : 0; } txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; + rc = + setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); @@ -12263,39 +13027,98 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; } else { - const size_t size = - pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? 
txn->mt_next_pgno - : txn->mt_end_pgno); - if (unlikely(size > env->me_dxb_mmap.limit)) { + const size_t size_bytes = pgno2bytes(env, txn->mt_end_pgno); + const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); + const size_t required_bytes = + (txn->mt_flags & MDBX_TXN_RDONLY) ? used_bytes : size_bytes; + if (unlikely(required_bytes > env->me_dxb_mmap.current)) { + /* Размер БД (для пишущих транзакций) или используемых данных (для + * читающих транзакций) больше предыдущего/текущего размера внутри + * процесса, увеличиваем. Сюда также попадает случай увеличения верхней + * границы размера БД и отображения. В читающих транзакциях нельзя + * изменять размер файла, который может быть больше необходимого этой + * транзакции. */ if (txn->mt_geo.upper > MAX_PAGENO + 1 || bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != txn->mt_geo.upper) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, - (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); - if (rc != MDBX_SUCCESS) + rc = dxb_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { + /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно + * уменьшить, но всё сложнее: + * - размер файла согласован со всеми читаемыми снимками на момент + * коммита последней транзакции; + * - в читающей транзакции размер файла может быть больше и него нельзя + * изменять, в том числе менять madvise (меньша размера файла нельзя, + * а за размером нет смысла). + * - в пишущей транзакции уменьшать размер файла можно только после + * проверки размера читаемых снимков, но в этом нет смысла, так как + * это будет сделано при фиксации транзакции. 
+ * + * В сухом остатке, можно только установить dxb_mmap.current равным + * размеру файла, а это проще сделать без вызова dxb_resize() и усложения + * внутренней логики. + * + * В этой тактике есть недостаток: если пишущите транзакции не регулярны, + * и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за + * читающих транзакций использующих предыдущие снимки. */ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireShared(&env->me_remap_guard); +#else + rc = osal_fastmutex_acquire(&env->me_remap_guard); +#endif + if (likely(rc == MDBX_SUCCESS)) { + rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); + if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); + if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) + env->me_dxb_mmap.current = (size_t)env->me_dxb_mmap.filesize; + } +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_ReleaseShared(&env->me_remap_guard); +#else + int err = osal_fastmutex_release(&env->me_remap_guard); + if (unlikely(err) && likely(rc == MDBX_SUCCESS)) + rc = err; +#endif + } + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } else { - env->me_dxb_mmap.current = size; - env->me_dxb_mmap.filesize = - (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; } + eASSERT(env, + pgno2bytes(env, txn->mt_next_pgno) <= env->me_dxb_mmap.current); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || + if (((used_bytes > env->me_dbgeo.lower && env->me_dbgeo.shrink) || (mdbx_RunningUnderWine() && /* under Wine acquisition of remap_guard is always required, * since Wine don't support section extending, * i.e. in both cases unmap+map are required. 
*/ - size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && + used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ + } else { + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + + tASSERT(txn, txn == env->me_txn0); + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + rc = cursor_init(gc, txn, FREE_DBI); + if (rc != MDBX_SUCCESS) + goto bailout; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, txn); @@ -12377,13 +13200,6 @@ int mdbx_txn_renew(MDBX_txn *txn) { return rc; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, - MDBX_txn **ret) { - return __inline_mdbx_txn_begin(env, parent, flags, ret); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) @@ -12399,9 +13215,6 @@ void *mdbx_txn_get_userctx(const MDBX_txn *txn) { int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { - MDBX_txn *txn; - size_t size, tsize; - if (unlikely(!ret)) return MDBX_EINVAL; *ret = NULL; @@ -12420,6 +13233,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, flags |= env->me_flags & MDBX_WRITEMAP; + MDBX_txn *txn = nullptr; if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ rc = check_txn_rw(parent, @@ -12450,9 +13264,13 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, goto renew; } - size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); - size += 
tsize = sizeof(MDBX_txn); - if (unlikely((txn = osal_malloc(size)) == NULL)) { + const size_t base = (flags & MDBX_TXN_RDONLY) + ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) + : sizeof(MDBX_txn); + const size_t size = + base + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + txn = osal_malloc(size); + if (unlikely(txn == nullptr)) { DEBUG("calloc: %s", "failed"); return MDBX_ENOMEM; } @@ -12460,11 +13278,16 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memset(txn, 0xCD, size); VALGRIND_MAKE_MEM_UNDEFINED(txn, size); #endif /* MDBX_DEBUG */ - memset(txn, 0, tsize); + MDBX_ANALYSIS_ASSUME(size > base); + memset(txn, 0, + (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base); + txn->mt_dbs = ptr_disp(txn, base); + txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); +#if MDBX_DEBUG + txn->mt_cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ +#endif /* MDBX_DEBUG */ + txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs; txn->mt_flags = flags; txn->mt_env = env; @@ -12493,15 +13316,15 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const size_t di = dpl_exist(parent, lp->mp_pgno); - tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); tASSERT(parent, lp->mp_flags == P_LOOSE); rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; - parent->tw.loose_pages = lp->mp_next; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + parent->tw.loose_pages = mp_next(lp); /* Remove from 
dirty list */ - page_wash(parent, di, lp, 1); + page_wash(parent, dpl_exist(parent, lp->mp_pgno), lp, 1); } while (parent->tw.loose_pages); parent->tw.loose_count = 0; #if MDBX_ENABLE_REFUND @@ -12513,7 +13336,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->tw.dirtylru = parent->tw.dirtylru; dpl_sort(parent); - if (parent->tw.spill_pages) + if (parent->tw.spilled.list) spill_purge(parent); tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= @@ -12590,7 +13413,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, eASSERT(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); - assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); + assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; @@ -12695,10 +13518,10 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { env, txn->mt_child ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); - info->txn_space_dirty = - txn->tw.dirtylist - ? pgno2bytes(env, txn->tw.dirtylist->pages_including_loose) - : 0; + info->txn_space_dirty = pgno2bytes( + env, txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : (txn->tw.writemap_dirty_npages + + txn->tw.writemap_spilled_npages)); info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { @@ -12842,14 +13665,18 @@ static void dbi_update(MDBX_txn *txn, int keep) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; } else { - char *ptr = env->me_dbxs[i].md_name.iov_base; - if (ptr) { - env->me_dbxs[i].md_name.iov_len = 0; + const MDBX_val name = env->me_dbxs[i].md_name; + if (name.iov_base) { + env->me_dbxs[i].md_name.iov_base = nullptr; eASSERT(env, env->me_dbflags[i] == 0); atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), mo_AcquireRelease); - env->me_dbxs[i].md_name.iov_base = NULL; - osal_free(ptr); + env->me_dbxs[i].md_name.iov_len = 0; + if (name.iov_len) + osal_free(name.iov_base); + } else { + eASSERT(env, name.iov_len == 0); + eASSERT(env, env->me_dbflags[i] == 0); } } } @@ -12908,10 +13735,8 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { remove_dl: npages = dpl_npages(dl, r); dl->pages_including_loose -= npages; - if (!MDBX_AVOID_MSYNC || !(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, dl->items[r].ptr, npages); - } ++r; next_i: i += step; @@ -13014,8 +13839,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; - pnl_free(txn->tw.spill_pages); - txn->tw.spill_pages = nullptr; + pnl_free(txn->tw.spilled.list); + txn->tw.spilled.list = nullptr; if (txn == env->me_txn0) { eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ @@ -13065,8 +13890,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { /* undo resize 
performed by child txn */ - rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now, - parent->mt_geo.upper); + rc = dxb_resize(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper, impilict_shrink); if (rc == MDBX_EPERM) { /* unable undo resize (it is regular for Windows), * therefore promote size changes from child to the parent txn */ @@ -13083,7 +13908,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { "the parent", rc); parent->mt_flags |= MDBX_TXN_ERROR; - if (!env->me_dxb_mmap.address) + if (!env->me_dxb_mmap.base) env->me_flags |= MDBX_FATAL_ERROR; } } @@ -13166,7 +13991,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, size_t gc = 0; MDBX_val key, data; - while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { + while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { if (unlikely(key.iov_len != sizeof(txnid_t))) return MDBX_CORRUPTED; @@ -13197,8 +14022,9 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, txn->mt_dbistate[i] |= DBI_AUDITED; if (txn->mt_dbs[i].md_root == P_INVALID) continue; - used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; + used += (size_t)txn->mt_dbs[i].md_branch_pages + + (size_t)txn->mt_dbs[i].md_leaf_pages + + (size_t)txn->mt_dbs[i].md_overflow_pages; if (i != MAIN_DBI) continue; @@ -13215,7 +14041,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { if ((txn->mt_dbistate[k] & DBI_VALID) && - /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ + /* txn->mt_dbxs[k].md_name.iov_base && */ node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, node_ks(node)) == 0) { @@ -13226,8 +14052,8 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, } } } - used += - 
db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + used += (size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + + (size_t)db->md_overflow_pages; } } rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); @@ -13241,11 +14067,13 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, continue; for (MDBX_txn *t = txn; t; t = t->mt_parent) if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + - t->mt_dbs[i].md_overflow_pages; + used += (size_t)t->mt_dbs[i].md_branch_pages + + (size_t)t->mt_dbs[i].md_leaf_pages + + (size_t)t->mt_dbs[i].md_overflow_pages; txn->mt_dbistate[i] |= DBI_AUDITED; break; } + MDBX_ANALYSIS_ASSUME(txn != nullptr); if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { WARNING("audit %s@%" PRIaTXN ": unable account dbi %zd / \"%*s\", state 0x%02x", @@ -13282,7 +14110,7 @@ typedef struct gc_update_context { #if MDBX_ENABLE_BIGFOOT txnid_t bigfoot; #endif /* MDBX_ENABLE_BIGFOOT */ - MDBX_cursor_couple cursor; + MDBX_cursor cursor; } gcu_context_t; static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { @@ -13291,7 +14119,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { #if MDBX_ENABLE_BIGFOOT ctx->bigfoot = txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ - return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); + return cursor_init(&ctx->cursor, txn, FREE_DBI); } static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { @@ -13300,7 +14128,13 @@ static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { int err = MDBX_SUCCESS; - if (ctx->retired_stored) + if (ctx->retired_stored) { + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; + gc->mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = gc; do { 
MDBX_val key, val; #if MDBX_ENABLE_BIGFOOT @@ -13309,11 +14143,10 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { key.iov_base = &txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); - const struct cursor_set_result csr = - cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + const struct cursor_set_result csr = cursor_set(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; - err = mdbx_cursor_del(&ctx->cursor.outer, 0); + err = cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } @@ -13323,72 +14156,95 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { #else while (0); #endif /* MDBX_ENABLE_BIGFOOT */ + txn->mt_cursors[FREE_DBI] = gc->mc_next; + gc->mc_next = nullptr; + } + return err; +} + +static int gcu_touch(gcu_context_t *ctx) { + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.mc_txn->tw.retired_pages); + ctx->cursor.mc_flags |= C_GCU; + int err = cursor_touch(&ctx->cursor, &key, &val); + ctx->cursor.mc_flags -= C_GCU; return err; } /* Prepare a backlog of pages to modify GC itself, while reclaiming is * prohibited. It should be enough to prevent search in page_alloc_slowpath() * during a deleting, when GC tree is unbalanced. */ -static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, - const bool reserve4retired) { - const size_t pages4retiredlist = - reserve4retired ? 
number_of_ovpages( - txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) - : 0; - const size_t backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; - const size_t backlog4rebalance = backlog4cow + 1; +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { + const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; + const size_t for_rebalance = for_cow + 1 + + (txn->mt_dbs[FREE_DBI].md_depth + 1ul >= + txn->mt_dbs[FREE_DBI].md_branch_pages); + size_t for_split = ctx->retired_stored == 0; - if (likely(pages4retiredlist < 2 && - gcu_backlog_size(txn) > (reserve4retired - ? backlog4rebalance - : (backlog4cow + backlog4rebalance)))) + const intptr_t retired_left = + MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; + size_t for_relist = 0; + if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { + for_relist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / + txn->mt_env->me_maxgc_ov1page; + const size_t per_branch_page = txn->mt_env->me_maxgc_per_branch; + for (size_t entries = for_relist; entries > 1; for_split += entries) + entries = (entries + per_branch_page - 1) / per_branch_page; + } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { + for_relist = + number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + } + + const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; + const size_t for_tree_after_touch = for_rebalance + for_split; + const size_t for_all_before_touch = for_relist + for_tree_before_touch; + const size_t for_all_after_touch = for_relist + for_tree_after_touch; + + if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch)) return MDBX_SUCCESS; - TRACE( - ">> reserve4retired %c, backlog %zu, 4list %zu, 4cow %zu, 4rebalance %zu", - reserve4retired ? 
'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, - backlog4cow, backlog4rebalance); + TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " + "4split %zu, " + "4cow %zu, 4tree %zu)", + ctx->retired_stored, retired_left, gcu_backlog_size(txn), + for_all_before_touch, for_relist, for_split, for_cow, + for_tree_before_touch); - int err; - if (unlikely(pages4retiredlist > 2)) { - MDBX_val key, val; - key.iov_base = val.iov_base = nullptr; - key.iov_len = sizeof(txnid_t); - val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - err = cursor_spill(&ctx->cursor.outer, &key, &val); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - - tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); - txn->mt_flags -= MDBX_TXN_UPDATE_GC; - err = cursor_touch(&ctx->cursor.outer); + int err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); - if (unlikely(pages4retiredlist > 1) && + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { - tASSERT(txn, reserve4retired); - err = gcu_clean_stored_retired(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = page_alloc_slowpath(&ctx->cursor.outer, (pgno_t)pages4retiredlist, - MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) - .err; + if (unlikely(ctx->retired_stored)) { + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (!ctx->retired_stored) + return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); + } + err = page_alloc_slowpath(&ctx->cursor, for_relist, MDBX_ALLOC_RESERVE).err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); - cASSERT(&ctx->cursor.outer, - gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); + cASSERT(&ctx->cursor, + gcu_backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); } - while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && - err == 
MDBX_SUCCESS) - err = page_alloc_slowpath(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) + while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) + err = page_alloc_slowpath(&ctx->cursor, 0, + MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) .err; - txn->mt_flags += MDBX_TXN_UPDATE_GC; - TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); + TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " + "%zu, entries %zu", + gcu_backlog_size(txn), err, txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); + tASSERT(txn, + err != MDBX_NOTFOUND || (txn->mt_flags & MDBX_TXN_DRAINED_GC) != 0); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -13416,17 +14272,16 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { MDBX_env *const env = txn->mt_env; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; - txn->mt_flags += MDBX_TXN_UPDATE_GC; - ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; + ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &ctx->cursor; /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. - * Page numbers cannot disappear from txn->tw.retired_pages[]. */ + * But page numbers cannot disappear from txn->tw.retired_pages[]. 
*/ retry: - ++ctx->loop; - TRACE("%s", " >> restart"); + if (ctx->loop++) + TRACE("%s", " >> restart"); int rc = MDBX_SUCCESS; tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -13451,19 +14306,19 @@ retry: ctx->rid = txn->tw.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ - MDBX_val key, data; TRACE("%s", " >> continue"); if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || - ctx->retired_stored > env->me_maxgc_ov1page)) { - rc = gcu_prepare_backlog(txn, ctx, true); + (ctx->loop == 1 || ctx->retired_stored > env->me_maxgc_ov1page || + MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page)) { + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_val key, data; if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) @@ -13480,31 +14335,30 @@ retry: ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); + rc = cursor_set(&ctx->cursor, &key, NULL, MDBX_SET).err; if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, ctx->cleaned_slot, ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); - rc = mdbx_cursor_del(&ctx->cursor.outer, 0); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); txl_sort(txn->tw.lifo_reclaimed); } } else { - /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for tw.relist. */ + /* Удаляем оставшиеся вынутые из GC записи. 
*/ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { - rc = cursor_first(&ctx->cursor.outer, &key, NULL); + rc = cursor_first(&ctx->cursor, &key, NULL); if (rc == MDBX_NOTFOUND) break; if (unlikely(rc != MDBX_SUCCESS)) @@ -13521,7 +14375,7 @@ retry: if (ctx->cleaned_id > txn->tw.last_reclaimed) break; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -13529,8 +14383,8 @@ retry: tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); - rc = mdbx_cursor_del(&ctx->cursor.outer, 0); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -13558,33 +14412,32 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { + tASSERT(txn, txn->tw.loose_count > 0); /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { - if (txn->tw.loose_count > 0) { - TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, - txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE) - .err; - if (rc == MDBX_SUCCESS) { - TRACE("%s: retry since gc-slot for %zu loose-pages available", - dbg_prefix_mode, txn->tw.loose_count); - continue; - } - - /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.relist. 
*/ - if (unlikely((rc = pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) - goto bailout; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, - txn->tw.loose_count); + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, + txn->tw.loose_count); + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + if (rc == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %zu loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; } + + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to tw.relist. */ + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) + goto bailout; + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + pnl_xappend(txn->tw.retired_pages, lp->mp_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + } + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } else { /* Room for loose pages + temp PNL with same */ rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); @@ -13593,9 +14446,11 @@ retry: MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; size_t count = 0; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { - tASSERT(txn, mp->mp_flags == P_LOOSE); - loose[++count] = mp->mp_pgno; + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); + loose[++count] = lp->mp_pgno; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SETSIZE(loose, count); @@ -13609,7 +14464,8 @@ retry: MDBX_dpl *const dl = 
txn->tw.dirtylist; if (dl) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - size_t w = 0; + tASSERT(txn, dl->sorted <= dl->length); + size_t w = 0, sorted_out = 0; for (size_t r = w; ++r <= dl->length;) { MDBX_page *dp = dl->items[r].ptr; tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); @@ -13619,6 +14475,7 @@ retry: dl->items[w] = dl->items[r]; } else { tASSERT(txn, dp->mp_flags == P_LOOSE); + sorted_out += dl->sorted >= r; if (!MDBX_AVOID_MSYNC || !(env->me_flags & MDBX_WRITEMAP)) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); dpage_free(env, dp, 1); @@ -13628,8 +14485,9 @@ retry: TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", dbg_prefix_mode, dl->length, w); tASSERT(txn, txn->tw.loose_count == dl->length - w); + dl->sorted -= sorted_out; + tASSERT(txn, dl->sorted <= w); dpl_setlen(dl, w); - dl->sorted = 0; dl->pages_including_loose -= txn->tw.loose_count; txn->tw.dirtyroom += txn->tw.loose_count; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == @@ -13650,10 +14508,9 @@ retry: if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - txn->mt_flags -= MDBX_TXN_UPDATE_GC; - rc = page_search(&ctx->cursor.outer, NULL, - MDBX_PS_LAST | MDBX_PS_MODIFY); - txn->mt_flags += MDBX_TXN_UPDATE_GC; + rc = cursor_last(&ctx->cursor, nullptr, nullptr); + if (likely(rc != MDBX_SUCCESS)) + rc = gcu_touch(ctx); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } @@ -13663,18 +14520,37 @@ retry: do { if (ctx->bigfoot > txn->mt_txnid) { rc = gcu_clean_stored_retired(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); } retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); - rc = gcu_prepare_backlog(txn, ctx, true); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + if 
(retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix_mode, + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); ctx->retired_stored = 0; ctx->bigfoot = txn->mt_txnid; do { + if (ctx->retired_stored) { + rc = gcu_prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (ctx->retired_stored >= + MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", + dbg_prefix_mode, retired_pages_before, + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } + } key.iov_len = sizeof(txnid_t); key.iov_base = &ctx->bigfoot; const size_t left = @@ -13684,7 +14560,7 @@ retry: ? env->me_maxgc_ov1page : left; data.iov_len = (chunk + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -13720,9 +14596,9 @@ retry: key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { - gcu_prepare_backlog(txn, ctx, true); + gcu_prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ @@ -13733,7 +14609,7 @@ retry: eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix_mode, ctx->retired_stored, txn->mt_txnid); #endif /* MDBX_ENABLE_BIGFOOT */ if (LOG_ENABLED(MDBX_LOG_EXTRA)) { @@ -13775,7 +14651,7 @@ retry: if (0 >= (intptr_t)left) break; - const size_t 
prefer_max_scatter = 257; + const size_t prefer_max_scatter = MDBX_ENABLE_BIGFOOT ? MDBX_TXL_MAX : 257; txnid_t reservation_gc_id; if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { @@ -13789,17 +14665,13 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page && !ctx->dense) { - /* LY: need just a txn-id for save page list. */ + /* Hужен свободный для для сохранения списка страниц. */ bool need_cleanup = false; - txnid_t snap_oldest; + txnid_t snap_oldest = 0; retry_rid: - txn->mt_flags -= MDBX_TXN_UPDATE_GC; do { - snap_oldest = txn_oldest_reader(txn); - rc = page_alloc_slowpath(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE) - .err; + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + snap_oldest = env->me_lck->mti_oldest_reader.weak; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); @@ -13811,7 +14683,6 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page); - txn->mt_flags += MDBX_TXN_UPDATE_GC; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: got enough from GC.", dbg_prefix_mode); @@ -13829,7 +14700,7 @@ retry: } else { tASSERT(txn, txn->tw.last_reclaimed == 0); if (unlikely(txn_oldest_reader(txn) != snap_oldest)) - /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) + /* should retry page_alloc_slowpath() * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, @@ -13839,7 +14710,8 @@ retry: ctx->rid); } - /* LY: GC is empty, will look any free txn-id in high2low order. */ + /* В GC нет годных к переработке записей, + * будем использовать свободные id в обратном порядке. 
*/ while (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * @@ -13857,26 +14729,20 @@ retry: } tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); - --ctx->rid; + ctx->rid -= 1; key.iov_base = &ctx->rid; key.iov_len = sizeof(ctx->rid); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc == MDBX_SUCCESS)) { - DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", + DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", dbg_prefix_mode, ctx->rid); - ++ctx->rid; - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); - if (rc == MDBX_NOTFOUND) { - DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode); - ctx->dense = true; - break; - } + rc = cursor_first(&ctx->cursor, &key, nullptr); if (unlikely(rc != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (gc_first <= MIN_TXNID) { DEBUG("%s: no free GC's id(s) less than %" PRIaTXN " (going dense-mode)", @@ -13906,10 +14772,11 @@ retry: } if (need_cleanup || ctx->dense) { - if (ctx->cleaned_slot) - TRACE("%s: restart inner-loop to clear and re-create GC entries", + if (ctx->cleaned_slot) { + TRACE("%s: restart to clear and re-create GC entries", dbg_prefix_mode); - ctx->cleaned_slot = 0; + goto retry; + } continue; } } @@ -13924,13 +14791,13 @@ retry: tASSERT(txn, txn->tw.lifo_reclaimed == NULL); if (unlikely(ctx->rid == 0)) { ctx->rid = txn_oldest_reader(txn); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); - if (rc == MDBX_SUCCESS) { + rc = cursor_first(&ctx->cursor, &key, nullptr); + if (likely(rc == MDBX_SUCCESS)) { if (unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t 
gc_first = unaligned_peek_u64(4, key.iov_base); + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (ctx->rid >= gc_first) ctx->rid = gc_first - 1; if (unlikely(ctx->rid == 0)) { @@ -13957,6 +14824,11 @@ retry: : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid : INT16_MAX; if (avail_gc_slots > 1) { +#if MDBX_ENABLE_BIGFOOT + chunk = (chunk < env->me_maxgc_ov1page * (size_t)2) + ? chunk / 2 + : env->me_maxgc_ov1page; +#else if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { @@ -13993,6 +14865,7 @@ retry: : tail; } } +#endif /* MDBX_ENABLE_BIGFOOT */ } } tASSERT(txn, chunk > 0); @@ -14020,9 +14893,9 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); - gcu_prepare_backlog(txn, ctx, true); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, - MDBX_RESERVE | MDBX_NOOVERWRITE); + gcu_prepare_backlog(txn, ctx); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) @@ -14069,7 +14942,7 @@ retry: size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); - rc = cursor_first(&ctx->cursor.outer, &key, &data); + rc = cursor_first(&ctx->cursor, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { @@ -14103,7 +14976,7 @@ retry: dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -14117,7 +14990,6 @@ retry: key.iov_len = sizeof(fill_gc_id); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); - txn->mt_flags += MDBX_TXN_FROZEN_RE; size_t chunk = data.iov_len / 
sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, @@ -14125,14 +14997,11 @@ retry: if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (ctx->loop < 7) - txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; } chunk = left; } - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, - MDBX_CURRENT | MDBX_RESERVE); - txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_CURRENT | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); @@ -14181,7 +15050,7 @@ retry: if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); - rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); + rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { @@ -14212,7 +15081,7 @@ retry: ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; + txn->mt_cursors[FREE_DBI] = ctx->cursor.mc_next; MDBX_PNL_SETSIZE(txn->tw.relist, 0); #if MDBX_ENABLE_PROFGC @@ -14226,7 +15095,7 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *const dl = dpl_sort(txn); int rc = MDBX_SUCCESS; - size_t r, w; + size_t r, w, total_npages = 0; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; if (dp->mp_flags & P_LOOSE) { @@ -14234,9 +15103,10 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { continue; } unsigned npages = dpl_npages(dl, r); + total_npages += npages; rc = iov_page(txn, ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) - break; + return rc; } if (!iov_empty(ctx)) { @@ -14244,6 +15114,13 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { rc = iov_write(ctx); } + if (likely(rc == MDBX_SUCCESS) && 
ctx->fd == txn->mt_env->me_lazy_fd) { + txn->mt_env->me_lck->mti_unsynced_pages.weak += total_npages; + if (!txn->mt_env->me_lck->mti_eoos_timestamp.weak) + txn->mt_env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); + } + + txn->tw.dirtylist->pages_including_loose -= total_npages; while (r <= dl->length) dl->items[++w] = dl->items[r++]; @@ -14252,6 +15129,8 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count); return rc; } @@ -14270,10 +15149,6 @@ static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, return dbi_import(txn, dbi); } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - /* Merge child txn into parent */ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { @@ -14362,7 +15237,8 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); /* Filter-out parent spill list */ - if (parent->tw.spill_pages && MDBX_PNL_GETSIZE(parent->tw.spill_pages) > 0) { + if (parent->tw.spilled.list && + MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) { const MDBX_PNL sl = spill_purge(parent); size_t len = MDBX_PNL_GETSIZE(sl); if (len) { @@ -14377,7 +15253,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_GETSIZE(sl) = i; + MDBX_PNL_SETSIZE(sl, i); #else assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); size_t i = 0; @@ -14450,10 +15326,10 @@ static __inline void 
txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } /* Remove anything in our spill list from parent's dirty list */ - if (txn->tw.spill_pages) { - tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages, + if (txn->tw.spilled.list) { + tASSERT(txn, pnl_check_allocated(txn->tw.spilled.list, (size_t)parent->mt_next_pgno << 1)); - dpl_sift(parent, txn->tw.spill_pages, true); + dpl_sift(parent, txn->tw.spilled.list, true); tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == (parent->mt_parent ? parent->mt_parent->tw.dirtyroom @@ -14605,51 +15481,88 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, tASSERT(parent, dirtylist_check(parent)); dpl_free(txn); - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { /* Must not fail since space was preserved above. */ - pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); - pnl_free(txn->tw.spill_pages); + pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list); + pnl_free(txn->tw.spilled.list); } else { - parent->tw.spill_pages = txn->tw.spill_pages; - parent->tw.spill_least_removed = txn->tw.spill_least_removed; + parent->tw.spilled.list = txn->tw.spilled.list; + parent->tw.spilled.least_removed = txn->tw.spilled.least_removed; } tASSERT(parent, dirtylist_check(parent)); } parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - if (parent->tw.spill_pages) { - assert(pnl_check_allocated(parent->tw.spill_pages, + if (parent->tw.spilled.list) { + assert(pnl_check_allocated(parent->tw.spilled.list, (size_t)parent->mt_next_pgno << 1)); - if (MDBX_PNL_GETSIZE(parent->tw.spill_pages)) + if (MDBX_PNL_GETSIZE(parent->tw.spilled.list)) parent->mt_flags |= MDBX_TXN_SPILLS; } } +static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { + MDBX_env *const env = txn->mt_env; + if (MDBX_ENABLE_PROFGC) { + pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; + latency->gc_prof.work_counter = 
ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; + + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; + latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; + + latency->gc_prof.wloops = ptr->gc_prof.wloops; + latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->me_txn0) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); + } else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +} + int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); const uint64_t ts_0 = latency ? 
osal_monotime() : 0; uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; - MDBX_env *const env = txn->mt_env; int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - goto provide_latency; + if (unlikely(rc != MDBX_SUCCESS)) { + if (latency) + memset(latency, 0, sizeof(*latency)); + return rc; + } + + MDBX_env *const env = txn->mt_env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != osal_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + if (latency) + memset(latency, 0, sizeof(*latency)); + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { rc = MDBX_RESULT_TRUE; goto fail; } -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - rc = MDBX_PANIC; - goto provide_latency; - } -#endif /* MDBX_ENV_CHECKPID */ - /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; @@ -14692,8 +15605,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { sizeof(parent->mt_geo)) == 0); tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, sizeof(parent->mt_canary)) == 0); - tASSERT(txn, !txn->tw.spill_pages || - MDBX_PNL_GETSIZE(txn->tw.spill_pages) == 0); + tASSERT(txn, !txn->tw.spilled.list || + MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ @@ -14713,10 +15626,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { - rc = pnl_need(&parent->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages)); + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { + rc = pnl_need(&parent->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -14789,9 +15702,12 @@ int 
mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { txn_refund(parent); if (ASSERT_ENABLED()) { /* Check parent's loose pages not suitable for refund */ - for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) + for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = mp_next(lp)) { tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && lp->mp_pgno + 1 < parent->mt_next_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + } /* Check parent's reclaimed pages not suitable for refund */ if (MDBX_PNL_GETSIZE(parent->tw.relist)) tASSERT(parent, @@ -14855,10 +15771,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Может быть mod_txnid > front после коммита вложенных тразакций */ db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, - &txn->mt_dbxs[i].md_name, - &data, F_SUBDATA)); + WITH_CURSOR_TRACKING( + couple.outer, + rc = cursor_put_nochecklen(&couple.outer, &txn->mt_dbxs[i].md_name, + &data, F_SUBDATA)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -14877,6 +15793,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (unlikely(rc != MDBX_SUCCESS)) goto fail; + tASSERT(txn, txn->tw.loose_count == 0); txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY) ? 
txn->mt_txnid : txn->mt_dbs[FREE_DBI].md_mod_txnid; @@ -14894,23 +15811,74 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } + bool need_flush_for_nometasync = false; const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, - mo_Relaxed) != (uint32_t)head.txnid) { - /* sync prev meta */ - rc = meta_sync(env, head); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "presync-meta", rc); - goto fail; + const uint32_t meta_sync_txnid = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); + /* sync prev meta */ + if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { + /* Исправление унаследованного от LMDB недочета: + * + * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате fdatasync() при записи данных этой транзакции. + * + * Всё хорошо, если все процессы работающие с БД используют WRITEMAP + * без MDBX_AVOID_MSYNC. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате msync() при записи данных этой транзакции. + * + * Если же в процессах работающих с БД используется оба метода, как sync() + * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то + * становится невозможным обеспечить фиксацию на диске мета-страницы + * предыдущей транзакции и данных текущей транзакции, за счет одной + * sync-операцией выполняемой после записи данных текущей транзакции. + * Соответственно, требуется явно обновлять мета-страницу, что полностью + * уничтожает выгоду от NOMETASYNC. */ + const uint32_t txnid_dist = + ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? 
MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() + * или msync() для гарантированной фиксации на диске мета-страницы, + * которая была "лениво" отправлена на запись в предыдущей транзакции, + * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ + if ( +#if defined(_WIN32) || defined(_WIN64) + !env->me_overlapped_fd && +#endif + meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) + need_flush_for_nometasync = true; + else { + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); + goto fail; + } } } if (txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.loose_count == 0); + + mdbx_filehandle_t fd = +#if defined(_WIN32) || defined(_WIN64) + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + (void)need_flush_for_nometasync; +#else +#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2 + (need_flush_for_nometasync || + env->me_dsync_fd == INVALID_HANDLE_VALUE || + txn->tw.dirtylist->length > env->me_options.writethrough_threshold || + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) + ? 
env->me_lazy_fd + : env->me_dsync_fd; +#endif /* Windows */ + iov_ctx_t write_ctx; rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, - txn->tw.dirtylist->pages_including_loose - - txn->tw.loose_count); + txn->tw.dirtylist->pages_including_loose, fd, false); if (unlikely(rc != MDBX_SUCCESS)) { ERROR("txn-%s: error %d", "iov-init", rc); goto fail; @@ -14923,6 +15891,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } } else { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + if (!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); } /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ @@ -14964,6 +15935,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: + if (latency) + take_gcprof(txn, latency); rc = txn_end(txn, end_mode); provide_latency: @@ -14975,42 +15948,6 @@ provide_latency: latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; latency->sync = (ts_5 > ts_4) ? 
osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - -#if MDBX_ENABLE_PROFGC - pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; - latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; - latency->gc_prof.work_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); - latency->gc_prof.work_xtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); - latency->gc_prof.work_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); - latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; - latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; - latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; - - latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; - latency->gc_prof.self_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); - latency->gc_prof.self_xtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); - latency->gc_prof.self_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); - latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; - latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; - latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; - - latency->gc_prof.wloops = ptr->gc_prof.wloops; - latency->gc_prof.coalescences = ptr->gc_prof.coalescences; - latency->gc_prof.wipes = ptr->gc_prof.wipes; - latency->gc_prof.flushes = ptr->gc_prof.flushes; - latency->gc_prof.kicks = ptr->gc_prof.kicks; - if (txn == env->me_txn0) - memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); -#else - memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); -#endif /* MDBX_ENABLE_PROFGC */ - const uint64_t ts_6 = osal_monotime(); latency->ending = ts_5 ? 
osal_monotime_to_16dot16(ts_6 - ts_5) : 0; latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); @@ -15019,10 +15956,160 @@ provide_latency: fail: txn->mt_flags |= MDBX_TXN_ERROR; + if (latency) + take_gcprof(txn, latency); mdbx_txn_abort(txn); goto provide_latency; } +static __always_inline int cmp_int_inline(const size_t expected_alignment, + const MDBX_val *a, + const MDBX_val *b) { + if (likely(a->iov_len == b->iov_len)) { + if (sizeof(size_t) > 7 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + if (likely(a->iov_len == 4)) + return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base), + unaligned_peek_u32(expected_alignment, b->iov_base)); + if (sizeof(size_t) < 8 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + } + ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", + a->iov_base, a->iov_len, b->iov_base, b->iov_len); + return 0; +} + +__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(1, a, b); +} + +/* Compare two items pointing at 2-byte aligned unsigned int's. */ +#if MDBX_UNALIGNED_OK < 2 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(2, a, b); +} +#else +#define cmp_int_align2 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items pointing at aligned unsigned int's. 
*/ +#if MDBX_UNALIGNED_OK < 4 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(4, a, b); +} +#else +#define cmp_int_align4 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items lexically */ +__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { + if (a->iov_len == b->iov_len) + return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; + return likely(diff_data) ? diff_data : diff_len; +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +tail3le(const uint8_t *p, size_t l) { + STATIC_ASSERT(sizeof(unsigned) > 2); + // 1: 0 0 0 + // 2: 0 1 1 + // 3: 0 1 2 + return p[0] | p[l >> 1] << 8 | p[l - 1] << 16; +} + +/* Compare two items in reverse byte order */ +__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { + size_t left = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + if (likely(left)) { + const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); + const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); + while (left >= sizeof(size_t)) { + pa -= sizeof(size_t); + pb -= sizeof(size_t); + left -= sizeof(size_t); + STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8); + if (sizeof(size_t) == 4) { + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? 
-1 : 1; + } else { + uint64_t xa = unaligned_peek_u64(1, pa); + uint64_t xb = unaligned_peek_u64(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap64(xa); + xb = osal_bswap64(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + } + if (sizeof(size_t) == 8 && left >= 4) { + pa -= 4; + pb -= 4; + left -= 4; + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + if (left) { + unsigned xa = tail3le(pa - left, left); + unsigned xb = tail3le(pb - left, left); + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + } + return CMP2INT(a->iov_len, b->iov_len); +} + +/* Fast non-lexically comparator */ +__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { + int diff = CMP2INT(a->iov_len, b->iov_len); + return (likely(diff) || a->iov_len == 0) + ? 
diff + : memcmp(a->iov_base, b->iov_base, a->iov_len); +} + +__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, + size_t l) { + if (likely(l > 3)) { + if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) + return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | + (unaligned_peek_u32(1, a + l - 4) - + unaligned_peek_u32(1, b + l - 4))) == 0; + if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17)) + return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) | + (unaligned_peek_u64(1, a + l - 8) - + unaligned_peek_u64(1, b + l - 8))) == 0; + return memcmp(a, b, l) == 0; + } + if (likely(l)) + return tail3le(a, l) == tail3le(b, l); + return true; +} + +static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { + return unlikely(a->iov_len == b->iov_len) && + eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); +} + static int validate_meta(MDBX_env *env, MDBX_meta *const meta, const MDBX_page *const page, const unsigned meta_number, unsigned *guess_pagesize) { @@ -15275,7 +16362,7 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, MDBX_meta *dest) { *dest = *meta; return validate_meta(env, dest, data_page(meta), - bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr); + bytes2pgno(env, ptr_dist(meta, env->me_map)), nullptr); } /* Read the environment parameters of a DB environment @@ -15443,8 +16530,8 @@ __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; meta_set_txnid(env, model_meta, MIN_TXNID + num); unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); - eASSERT(env, coherency_check_meta(env, model_meta, true)); - return (MDBX_page *)((uint8_t *)model + env->me_psize); + eASSERT(env, check_meta_coherency(env, model_meta, true)); + return ptr_disp(model, env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. 
@@ -15457,20 +16544,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { return page_meta(page2); } -#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) -static size_t madvise_threshold(const MDBX_env *env, - const size_t largest_bytes) { - /* TODO: use options */ - const unsigned factor = 9; - const size_t threshold = (largest_bytes < (65536ul << factor)) - ? 65536 /* minimal threshold */ - : (largest_bytes > (MEGABYTE * 4 << factor)) - ? MEGABYTE * 4 /* maximal threshold */ - : largest_bytes >> factor; - return bytes_align2os_bytes(env, threshold); -} -#endif /* MDBX_ENABLE_MADVISE */ - static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika) { eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); @@ -15504,126 +16577,132 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, pgno_t shrink = 0; if (flags & MDBX_SHRINK_ALLOWED) { - /* LY: check conditions to discard unused pages */ - const pgno_t largest_pgno = find_largest_snapshot( - env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) - ? head.ptr_c->mm_geo.next - : pending->mm_geo.next); - eASSERT(env, largest_pgno >= NUM_METAS); + const size_t prev_discarded_pgno = + atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); + if (prev_discarded_pgno < pending->mm_geo.next) + env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next; + else if (prev_discarded_pgno >= + pending->mm_geo.next + env->me_madv_threshold) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = find_largest_snapshot( + env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) + ? 
head.ptr_c->mm_geo.next + : pending->mm_geo.next); + eASSERT(env, largest_pgno >= NUM_METAS); + #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - const pgno_t edge = env->me_poison_edge; - if (edge > largest_pgno) { - env->me_poison_edge = largest_pgno; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + - pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); - } + const pgno_t edge = env->me_poison_edge; + if (edge > largest_pgno) { + env->me_poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + } #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + #if MDBX_ENABLE_MADVISE && \ (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) - const size_t largest_bytes = pgno2bytes(env, largest_pgno); - /* threshold to avoid unreasonable frequent madvise() calls */ - const size_t threshold = madvise_threshold(env, largest_bytes); - const size_t discard_edge_bytes = bytes_align2os_bytes( - env, ((MDBX_RDONLY & - (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak - : env->me_flags)) - ? 
largest_bytes - : largest_bytes + threshold)); - const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); - const pgno_t prev_discarded_pgno = - atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { - NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno, - prev_discarded_pgno); - atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, - mo_Relaxed); - const size_t prev_discarded_bytes = - ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); - ENSURE(env, prev_discarded_bytes > discard_edge_bytes); - munlock_after(env, discard_edge_pgno, - bytes_align2os_bytes(env, env->me_dxb_mmap.current)); - const uint32_t munlocks_before = - atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); + const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) { + const size_t prev_discarded_bytes = + pgno_align2os_bytes(env, prev_discarded_pgno); + const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno); + /* из-за выравнивания prev_discarded_bytes и discard_edge_bytes + * могут быть равны */ + if (prev_discarded_bytes > discard_edge_bytes) { + NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno, + prev_discarded_pgno); + munlock_after(env, discard_edge_pgno, + bytes_align2os_bytes(env, env->me_dxb_mmap.current)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); #if defined(MADV_DONTNEED) - int advise = MADV_DONTNEED; + int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) - advise = MADV_FREE; + if ((env->me_flags & MDBX_WRITEMAP) && + linux_kernel_version > 0x04050000) + advise = MADV_FREE; #endif /* MADV_FREE */ - int err = madvise(env->me_map + 
discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, advise) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, advise) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #else - int err = ignore_enosys(posix_madvise( - env->me_map + discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); + int err = ignore_enosys(posix_madvise( + ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif - if (unlikely(MDBX_IS_ERROR(err))) { - const uint32_t mlocks_after = - atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); - if (err == MDBX_EINVAL) { - const int severity = (mlocks_after - munlocks_before) - ? MDBX_LOG_NOTICE - : MDBX_LOG_WARN; - if (LOG_ENABLED(severity)) - debug_log(severity, __func__, __LINE__, - "%s-madvise: ignore EINVAL (%d) since some pages maybe " - "locked (%u/%u mlcnt-processes)", - "shrink", err, mlocks_after, munlocks_before); - } else { - ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", - "shrink", "DONTNEED", discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, mlocks_after, - munlocks_before, err); - return err; + if (unlikely(MDBX_IS_ERROR(err))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (err == MDBX_EINVAL) { + const int severity = (mlocks_after - munlocks_before) + ? 
MDBX_LOG_NOTICE + : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log( + severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "shrink", err, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, mlocks_after, + munlocks_before, err); + return err; + } + } else + env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; } - } else - env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; - } + } #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ - /* LY: check conditions to shrink datafile */ - const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; - pgno_t shrink_step = 0; - if (pending->mm_geo.shrink_pv && - pending->mm_geo.now - pending->mm_geo.next > - (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { - if (pending->mm_geo.now > largest_pgno && - pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow_pv - ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) - : shrink_step; - const pgno_t with_backlog_gap = largest_pgno + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - if (TROIKA_HAVE_STEADY(troika)) - /* force steady, but only if steady-checkpoint is present */ - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (unlikely(head.txnid == pending->unsafe_txnid)) { - const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); - NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - pending->unsafe_txnid, txnid); - ENSURE(env, !env->me_txn0 || - (env->me_txn0->mt_owner != osal_thread_self() && - !env->me_txn)); - if (unlikely(txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - ERROR("txnid overflow, raise %d", rc); - goto fail; + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; + pgno_t shrink_step = 0; + if (pending->mm_geo.shrink_pv && + pending->mm_geo.now - pending->mm_geo.next > + (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + + backlog_gap) { + if (pending->mm_geo.now > largest_pgno && + pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow_pv + ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) + : shrink_step; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = + pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - + with_backlog_gap % aligner); + const pgno_t bottom = (aligned > pending->mm_geo.lower) + ? 
aligned + : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + if (TROIKA_HAVE_STEADY(troika)) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + pending->unsafe_txnid, txnid); + ENSURE(env, !env->me_txn0 || + (env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn)); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto fail; + } + meta_set_txnid(env, pending, txnid); + eASSERT(env, check_meta_coherency(env, pending, true)); } - meta_set_txnid(env, pending, txnid); - eASSERT(env, coherency_check_meta(env, pending, true)); } } } @@ -15644,8 +16723,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; - } - if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + } else if (unlikely(env->me_incore)) + goto skip_incore_sync; + if (flags & MDBX_WRITEMAP) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; #else @@ -15667,7 +16747,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; } - eASSERT(env, coherency_check_meta(env, pending, true)); + eASSERT(env, check_meta_coherency(env, pending, true)); /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { @@ -15676,6 +16756,10 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + skip_incore_sync: + eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); + /* Может быть нулевым если unsynced_pages > 0 в результате спиллинга. + * eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); */ unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } @@ -15751,9 +16835,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * to allow readers catch actual pagesize. 
*/ - uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; - uint8_t *provoke_end = (uint8_t *)&target->mm_sign; - memset(provoke_begin, 0xCC, provoke_end - provoke_begin); + void *provoke_begin = &target->mm_dbs[FREE_DBI].md_root; + void *provoke_end = &target->mm_sign; + memset(provoke_begin, 0xCC, ptr_dist(provoke_end, provoke_begin)); jitter4testing(false); #endif @@ -15768,7 +16852,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, /* LY: 'commit' the meta */ meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); jitter4testing(true); - eASSERT(env, coherency_check_meta(env, target, true)); + eASSERT(env, check_meta_coherency(env, target, true)); } else { /* dangerous case (target == head), only mm_sign could * me updated, check assertions once again */ @@ -15778,54 +16862,58 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, memcpy(target->mm_sign, pending->mm_sign, 8); osal_flush_incoherent_cpu_writeback(); jitter4testing(true); - if (!MDBX_AVOID_MSYNC) { - /* sync meta-pages */ + if (!env->me_incore) { + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) - ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - } else { + rc = osal_msync( + &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) ? 
MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(target); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); - if (likely(rc == MDBX_SUCCESS)) { - osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); - if ((flags & MDBX_NOMETASYNC) == 0 && - env->me_fd4meta == env->me_lazy_fd) { + const MDBX_page *page = data_page(target); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + ptr_dist(page, env->me_map)); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), + env->me_os_psize); + if ((flags & MDBX_NOMETASYNC) == 0 && + env->me_fd4meta == env->me_lazy_fd) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; + env->me_lck->mti_pgop_stat.fsync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } } } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; } else { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_meta undo_meta = *target; rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + ptr_dist(target, env->me_map)); if (unlikely(rc != MDBX_SUCCESS)) { undo: DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
*/ osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + ptr_dist(target, env->me_map)); goto fail; } osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) { + if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd && + !env->me_incore) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.fsync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -15836,17 +16924,24 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } uint64_t timestamp = 0; - while ("workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { - rc = - coherency_check_written(env, pending->unsafe_txnid, target, ×tamp); + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_written(env, pending->unsafe_txnid, target, + bytes2pgno(env, ptr_dist(target, env->me_map)), + ×tamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto fail; } + + const uint32_t sync_txnid_dist = + ((flags & MDBX_NOMETASYNC) == 0) ? 0 + : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; env->me_lck->mti_meta_sync_txnid.weak = pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - - ((flags & MDBX_NOMETASYNC) ? 
UINT32_MAX / 3 : 0); + sync_txnid_dist; *troika = meta_tap(env); for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) @@ -15857,11 +16952,11 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (unlikely(shrink)) { VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", pending->mm_geo.now, shrink); - rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper); + rc = dxb_resize(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper, impilict_shrink); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; - eASSERT(env, coherency_check_meta(env, target, true)); + eASSERT(env, check_meta_coherency(env, target, true)); } MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -15907,6 +17002,9 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { ENSURE(env, maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; + env->me_maxgc_per_branch = + (unsigned)((pagesize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); @@ -16005,28 +17103,28 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data = -#if defined(_WIN32) || defined(_WIN64) - env->me_overlapped_fd = -#endif /* Windows */ - env->me_lfd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = + INVALID_HANDLE_VALUE; env->me_pid = osal_getpid(); env->me_stuck_meta = -1; - env->me_options.dp_reserve_limit = 1024; - env->me_options.rp_augment_limit = 256 * 1024; - env->me_options.dp_limit = MDBX_DEBUG ? 
64 * 1024 / 42 : 64 * 1024; - if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS) - env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS; + env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; + env->me_options.dp_reserve_limit = MDBX_PNL_INITIAL; env->me_options.dp_initial = MDBX_PNL_INITIAL; - if (env->me_options.dp_initial > env->me_options.dp_limit) - env->me_options.dp_initial = env->me_options.dp_limit; env->me_options.spill_max_denominator = 8; env->me_options.spill_min_denominator = 8; env->me_options.spill_parent4child_denominator = 0; env->me_options.dp_loose_limit = 64; env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; +#if !(defined(_WIN32) || defined(_WIN64)) + env->me_options.writethrough_threshold = +#if defined(__linux__) || defined(__gnu_linux__) + mdbx_RunningOnWSL1 ? MAX_PAGENO : +#endif /* Linux */ + MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; +#endif /* Windows */ + env->me_os_psize = (unsigned)os_psize; setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize : MAX_PAGESIZE); @@ -16097,10 +17195,10 @@ __cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) { return *cached_result; } -__cold LIBMDBX_API int -mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, - intptr_t size_upper, intptr_t growth_step, - intptr_t shrink_threshold, intptr_t pagesize) { +__cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, + intptr_t size_now, intptr_t size_upper, + intptr_t growth_step, + intptr_t shrink_threshold, intptr_t pagesize) { int rc = check_env(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -16324,6 +17422,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); + adjust_defaults(env); ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); @@ -16351,22 +17450,20 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, ENSURE(env, pagesize == (intptr_t)env->me_psize); MDBX_meta meta; memset(&meta, 0, sizeof(meta)); - const MDBX_geo *current_geo; if (!inside_txn) { eASSERT(env, need_unlock); const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; while ("workaround for " - "https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { - meta = *head.ptr_c; - rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, - ×tamp); + "https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_head(env->me_txn0, head, ×tamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } + meta = *head.ptr_c; const txnid_t txnid = safe64_txnid_next(head.txnid); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; @@ -16374,11 +17471,17 @@ mdbx_env_set_geometry(MDBX_env *env, 
intptr_t size_lower, intptr_t size_now, goto bailout; } meta_set_txnid(env, &meta, txnid); - current_geo = &meta.mm_geo; - } else { - current_geo = &env->me_txn->mt_geo; } + const MDBX_geo *const current_geo = + &(env->me_txn ? env->me_txn : env->me_txn0)->mt_geo; + /* update env-geo to avoid influences */ + env->me_dbgeo.now = pgno2bytes(env, current_geo->now); + env->me_dbgeo.lower = pgno2bytes(env, current_geo->lower); + env->me_dbgeo.upper = pgno2bytes(env, current_geo->upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(current_geo->grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(current_geo->shrink_pv)); + MDBX_geo new_geo; new_geo.lower = bytes2pgno(env, size_lower); new_geo.now = bytes2pgno(env, size_now); @@ -16440,8 +17543,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper, - false); + rc = dxb_resize(env, current_geo->next, new_geo.now, new_geo.upper, + explicit_resize); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -16451,16 +17554,20 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } else { meta.mm_geo = new_geo; rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); + if (likely(rc == MDBX_SUCCESS)) { + env->me_dbgeo.now = pgno2bytes(env, new_geo.now = meta.mm_geo.now); + env->me_dbgeo.upper = + pgno2bytes(env, new_geo.upper = meta.mm_geo.upper); + } } - - if (likely(rc == MDBX_SUCCESS)) { - /* store new geo to env to avoid influences */ - env->me_dbgeo.now = pgno2bytes(env, new_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); - } + } + if (likely(rc == MDBX_SUCCESS)) { + /* update env-geo to avoid 
influences */ + eASSERT(env, env->me_dbgeo.now == pgno2bytes(env, new_geo.now)); + env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); + eASSERT(env, env->me_dbgeo.upper == pgno2bytes(env, new_geo.upper)); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); } } @@ -16470,33 +17577,11 @@ bailout: return rc; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { - return __inline_mdbx_env_set_mapsize(env, size); -} - -__cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { - return __inline_mdbx_env_set_maxdbs(env, dbs); -} - -__cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { - return __inline_mdbx_env_get_maxdbs(env, dbs); -} - -__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { - return __inline_mdbx_env_set_maxreaders(env, readers); -} - -__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { - return __inline_mdbx_env_get_maxreaders(env, readers); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - __cold static int alloc_page_buf(MDBX_env *env) { - return env->me_pbuf - ? MDBX_SUCCESS - : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, - &env->me_pbuf); + return env->me_pbuf ? 
MDBX_SUCCESS + : osal_memalign_alloc(env->me_os_psize, + env->me_psize * (size_t)NUM_METAS, + &env->me_pbuf); } /* Further setup required for opening an MDBX environment */ @@ -16526,8 +17611,8 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; header = *init_metas(env, env->me_pbuf); - err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, - 0); + err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, + env->me_psize * (size_t)NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -16569,8 +17654,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, pv2pages(header.mm_geo.grow_pv) * pagesize, pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - ERROR("%s: err %d", "could not apply preconfigured geometry from db", - err); + ERROR("%s: err %d", "could not apply geometry from db", err); return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; } } else if (env->me_dbgeo.now) { @@ -16719,9 +17803,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes, + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, + MDBX_ASAN_POISON_MEMORY_REGION(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); } env->me_poison_edge = @@ -16743,7 +17827,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, err = validate_meta_copy(env, target, &clone); if (unlikely(err != MDBX_SUCCESS)) { ERROR("target meta[%u] is corrupted", - bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + bytes2pgno(env, ptr_dist(data_page(target), env->me_map))); meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } @@ -16785,7 
+17869,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); if (unlikely(err != MDBX_SUCCESS)) { ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map), + bytes2pgno(env, ptr_dist(prefer_steady.ptr_c, env->me_map)), "steady", prefer_steady.txnid, "manual recovery"); meta_troika_dump(env, &troika); return MDBX_CORRUPTED; @@ -16794,8 +17878,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, break; } - const pgno_t pgno = - bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map); + const pgno_t pgno = bytes2pgno(env, ptr_dist(recent.ptr_c, env->me_map)); const bool last_valid = validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; eASSERT(env, @@ -16982,7 +18065,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, env->me_lck->mti_discarded_tail.weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) ? ignore_enosys(errno) : MDBX_SUCCESS; @@ -16995,7 +18078,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, env->me_lck->mti_discarded_tail.weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) ? 
ignore_enosys(errno) : MDBX_SUCCESS; @@ -17003,7 +18086,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; #elif defined(POSIX_MADV_DONTNEED) err = ignore_enosys(posix_madvise( - env->me_map + used_aligned2os_bytes, + ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -17190,14 +18273,10 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, + MDBX_SYNC_DATA | MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { - ERROR("initial-%s for lck-file failed", "msync"); - goto bailout; - } - err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("initial-%s for lck-file failed", "fsync"); + ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); goto bailout; } } else { @@ -17279,8 +18358,8 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; - /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ - if (r & MDBX_SAFE_NOSYNC) + /* force MDBX_NOMETASYNC if NOSYNC enabled */ + if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC)) r |= MDBX_NOMETASYNC; assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && @@ -17300,9 +18379,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env, meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); - eASSERT(env, coherency_check_meta(env, model, true)); + if (txnid) + eASSERT(env, check_meta_coherency(env, model, true)); if (shape) { - if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { + if (txnid && 
unlikely(!check_meta_coherency(env, shape, false))) { ERROR("bailout overriding meta-%zu since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, target, "pre", constmeta_txnid(shape)); @@ -17326,7 +18406,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); - if (unlikely(!coherency_check_meta(env, model, false))) { + if (unlikely(!check_meta_coherency(env, model, false))) { ERROR("bailout overriding meta-%zu since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, target, "post", txnid); @@ -17376,7 +18456,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env, osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); } - eASSERT(env, !env->me_txn && !env->me_txn0); + eASSERT(env, (!env->me_txn && !env->me_txn0) || + (env->me_stuck_meta == (int)target && + (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE)); return rc; } @@ -17419,9 +18502,13 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + osal_free(pathnameW); + } + return rc; } __cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, @@ -17453,20 +18540,15 @@ typedef struct { size_t ent_len; } MDBX_handle_env_pathname; -static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) { -#if defined(_WIN32) || 
defined(_WIN64) - while (len > 0) { - pathchar_t a = *l++; - pathchar_t b = *r++; - a = (a == '\\') ? '/' : a; - b = (b == '\\') ? '/' : b; - if (a != b) - return false; +__cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { + int err = osal_fileexists(lck_pathname); + if (unlikely(err != MDBX_RESULT_FALSE)) { + if (err == MDBX_RESULT_TRUE) + err = MDBX_DUPLICATED_CLK; + ERROR("Alternative/Duplicate LCK-file '%" MDBX_PRIsPATH "' error %d", + lck_pathname, err); } - return true; -#else - return memcmp(l, r, len * sizeof(pathchar_t)) == 0; -#endif + return err; } __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, @@ -17502,13 +18584,13 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, } #else struct stat st; - if (stat(pathname, &st)) { + if (stat(pathname, &st) != 0) { rc = errno; if (rc != MDBX_ENOFILE) return rc; if (mode == 0 || (*flags & MDBX_RDONLY) != 0) - /* can't open existing */ - return rc; + /* can't open non-existing */ + return rc /* MDBX_ENOFILE */; /* auto-create directory if requested */ const mdbx_mode_t dir_mode = @@ -17541,44 +18623,89 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, assert(dxb_name[0] == '/' && lck_name[0] == '/'); const size_t pathname_len = strlen(pathname); #endif - assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/'); + assert(!osal_isdirsep(lock_suffix[0])); ctx->ent_len = pathname_len; static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; - if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len && - path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name, - dxb_name_len)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len; + if (*flags & MDBX_NOSUBDIR) { + if (ctx->ent_len > dxb_name_len && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; + } else if (ctx->ent_len == dxb_name_len - 1 && 
osal_isdirsep(dxb_name[0]) && + osal_isdirsep(lck_name[0]) && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len + 1, + dxb_name + 1, dxb_name_len - 1)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len - 1; + } } - const size_t bytes_needed = - sizeof(pathchar_t) * ctx->ent_len * 2 + - ((*flags & MDBX_NOSUBDIR) ? sizeof(lock_suffix) + sizeof(pathchar_t) - : sizeof(lck_name) + sizeof(dxb_name)); + const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); + const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); + const size_t enogh4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = sizeof(pathchar_t) * ctx->ent_len * 2 + enogh4any; ctx->buffer_for_free = osal_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; ctx->dxb = ctx->buffer_for_free; - ctx->lck = ctx->dxb + ctx->ent_len + 1; - memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); - if (*flags & MDBX_NOSUBDIR) { - memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); - } else { - ctx->lck += dxb_name_len; - memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); - memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); - } - memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); + ctx->lck = ctx->dxb + ctx->ent_len + dxb_name_len + 1; + pathchar_t *const buf = ctx->buffer_for_free; + rc = MDBX_SUCCESS; + if (ctx->ent_len) { + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * pathname_len); + if (*flags & MDBX_NOSUBDIR) { + const pathchar_t *const lck_ext = + osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); + if (lck_ext) { + pathchar_t *pathname_ext = osal_fileext(buf, pathname_len); + memcpy(pathname_ext ? 
pathname_ext : buf + pathname_len, lck_ext, + sizeof(pathchar_t) * (ARRAY_END(lck_name) - lck_ext)); + rc = check_alternative_lck_absent(buf); + } + } else { + memcpy(buf + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + ctx->ent_len + dxb_name_len, lock_suffix, + sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + } - return MDBX_SUCCESS; + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * (ctx->ent_len + 1)); + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); + if (*flags & MDBX_NOSUBDIR) { + memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + } else { + memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + } + } else { + assert(!(*flags & MDBX_NOSUBDIR)); + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, + dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + } + return rc; } __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_deleteW(pathnameW, mode); + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_deleteW(pathnameW, mode); + osal_free(pathnameW); + } + return rc; } __cold int mdbx_env_deleteW(const wchar_t *pathname, @@ -17667,9 +18794,16 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { #if defined(_WIN32) || 
defined(_WIN64) - const wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_openW(env, pathnameW, flags, mode); + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); + } + return rc; } __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, @@ -17701,7 +18835,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. - * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ if ((flags & MDBX_WRITEMAP) == 0) { if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; @@ -17734,6 +18868,10 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, env_pathname.ent_len * sizeof(pathchar_t)); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; + env->me_dbxs[FREE_DBI].md_vlen_min = 4; + env->me_dbxs[FREE_DBI].md_vlen_max = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: * @@ -17841,13 +18979,14 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); - env->me_fd4data = env->me_fd4meta = env->me_lazy_fd; + env->me_fd4meta = env->me_lazy_fd; #if defined(_WIN32) || defined(_WIN64) - uint8_t ior_flags = 0; - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { - ior_flags = IOR_OVERLAPPED; - if ((flags & MDBX_WRITEMAP) 
&& MDBX_AVOID_MSYNC) { - /* Запрошен режим MDBX_SAFE_NOSYNC | MDBX_WRITEMAP при активной опции + eASSERT(env, env->me_overlapped_fd == 0); + bool ior_direct = false; + if (!(flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { + if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. * * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), @@ -17860,23 +18999,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, * 2) Кроме этого, в Windows запись в заблокированный регион файла * возможно только через тот-же дескриптор. Поэтому изначальный захват * блокировок посредством osal_lck_seize(), захват/освобождение блокировок - * во время пишущих транзакций и запись данных должны выполнять через один - * дескриптор. + * во время пишущих транзакций и запись данных должны выполнятся через + * один дескриптор. * * Таким образом, требуется прочитать волатильный заголовок БД, чтобы * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном * для записи данных, чтобы использовать именно этот дескриптор для * изначального захвата блокировок. */ MDBX_meta header; - if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS && - header.mm_psize >= env->me_os_psize) - ior_flags |= IOR_DIRECT; + uint64_t dxb_filesize; + int err = read_header(env, &header, MDBX_SUCCESS, true); + if ((err == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) || + (err == MDBX_ENODATA && mode && env->me_psize >= env->me_os_psize && + osal_filesize(env->me_lazy_fd, &dxb_filesize) == MDBX_SUCCESS && + dxb_filesize == 0)) + /* Может быть коллизия, если два процесса пытаются одновременно создать + * БД с разным размером страницы, который у одного меньше системной + * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная + * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. 
*/ + ior_direct = true; } - rc = - osal_openfile((ior_flags & IOR_DIRECT) ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT - : MDBX_OPEN_DXB_OVERLAPPED, - env, env_pathname.dxb, &env->me_overlapped_fd, 0); + rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, + env, env_pathname.dxb, &env->me_overlapped_fd, 0); if (rc != MDBX_SUCCESS) goto bailout; env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); @@ -17884,7 +19030,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = (int)GetLastError(); goto bailout; } - env->me_fd4data = env->me_overlapped_fd; osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); } #else @@ -17910,31 +19055,36 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } osal_fseek(env->me_lfd, safe_parking_lot_offset); - const MDBX_env_flags_t rigorous_flags = - MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; - const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | - MDBX_LIFORECLAIM | - MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; - eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 && - (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) { + if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC +#if defined(_WIN32) || defined(_WIN64) + | MDBX_EXCLUSIVE +#endif /* !Windows */ + ))) { rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); + if (MDBX_IS_ERROR(rc)) + goto bailout; if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { if ((flags & MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; - if (env->me_fd4data == env->me_lazy_fd) - env->me_fd4data = env->me_dsync_fd; osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); } } + const MDBX_env_flags_t lazy_flags = + MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC; + const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | + MDBX_NORDAHEAD | 
MDBX_RDONLY | + MDBX_WRITEMAP; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { + MDBX_env_flags_t snap_flags; + while ((snap_flags = atomic_load32(&lck->mti_envmode, mo_AcquireRelease)) == + MDBX_RDONLY) { if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, - env->me_flags & mode_flags)) { + (snap_flags = (env->me_flags & mode_flags)))) { /* The case: * - let's assume that for some reason the DB file is smaller * than it should be according to the geometry, @@ -17953,26 +19103,66 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (env->me_flags & MDBX_ACCEDE) { /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ - const unsigned diff = - (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; - NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, - env->me_flags ^ diff); + const MDBX_env_flags_t diff = + (snap_flags ^ env->me_flags) & + ((snap_flags & lazy_flags) ? mode_flags + : mode_flags & ~MDBX_WRITEMAP); env->me_flags ^= diff; + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, + env->me_flags ^ diff, env->me_flags); } - if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { - ERROR("%s", "current mode/flags incompatible with requested"); + /* Ранее упущенный не очевидный момент: При работе БД в режимах + * не-синхронной/отложенной фиксации на диске, все процессы-писатели должны + * иметь одинаковый режим MDBX_WRITEMAP. + * + * В противном случае, сброс на диск следует выполнять дважды: сначала + * msync(), затем fdatasync(). При этом msync() не обязан отрабатывать + * в процессах без MDBX_WRITEMAP, так как файл в память отображен только + * для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не + * позволяют выполнить фиксацию данных на диск, после их изменения в другом + * процессе. 
+ * + * В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP + * также не следует, поскольку никакой процесс (в том числе последний) не + * может гарантированно сбросить данные на диск, а следовательно не должен + * помечать какую-либо транзакцию как steady. + * + * В результате, требуется либо запретить совместную работу процессам с + * разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое + * смешивание и блокировать steady-пометки - что контрпродуктивно. */ + const MDBX_env_flags_t rigorous_flags = + (snap_flags & lazy_flags) + ? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP + : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC; + const MDBX_env_flags_t rigorous_diff = + (snap_flags ^ env->me_flags) & rigorous_flags; + if (rigorous_diff) { + ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " + "rigorous diff 0x%X", + env->me_flags, snap_flags, rigorous_diff); rc = MDBX_INCOMPATIBLE; goto bailout; } } + mincore_clean_cache(env); const int dxb_rc = setup_dxb(env, lck_rc, mode); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; } + rc = osal_check_fs_incore(env->me_lazy_fd); + env->me_incore = false; + if (rc == MDBX_RESULT_TRUE) { + env->me_incore = true; + NOTICE("%s", "in-core database"); + } else if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("check_fs_incore(), err %d", rc); + goto bailout; + } + if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (flags & MDBX_EXCLUSIVE) == 0)) { @@ -17982,11 +19172,14 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } DEBUG("opened dbenv %p", (void *)env); + if (!lck || lck_rc == MDBX_RESULT_TRUE) { + env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; + env->me_lck->mti_meta_sync_txnid.weak = + (uint32_t)recent_committed_txnid(env); + env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); + } if (lck) { if (lck_rc == MDBX_RESULT_TRUE) { - lck->mti_envmode.weak = env->me_flags & 
(mode_flags | MDBX_RDONLY); - lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); - lck->mti_reader_check_timestamp.weak = osal_monotime(); rc = osal_lck_downgrade(env); DEBUG("lck-downgrade-%s: rc %i", (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); @@ -18005,28 +19198,27 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, goto bailout; env->me_flags |= MDBX_ENV_TXKEY; } - } else { - env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); } if ((flags & MDBX_RDONLY) == 0) { - const size_t tsize = sizeof(MDBX_txn), + const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + sizeof(MDBX_atomic_uint32_t) + 1); rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { - memset(env->me_pbuf, -1, env->me_psize * 2); + memset(env->me_pbuf, -1, env->me_psize * (size_t)2); + memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, + env->me_psize); MDBX_txn *txn = osal_calloc(1, size); if (txn) { - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbs = ptr_disp(txn, tsize); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); txn->mt_dbiseqs = - (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); + ptr_disp(txn->mt_cursors, sizeof(MDBX_cursor *) * env->me_maxdbs); + txn->mt_dbistate = ptr_disp( + txn->mt_dbiseqs, sizeof(MDBX_atomic_uint32_t) * env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; @@ -18039,11 +19231,14 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = MDBX_ENOMEM; } if (rc == MDBX_SUCCESS) - rc = 
osal_ioring_create(&env->me_ioring, + rc = osal_ioring_create(&env->me_ioring #if defined(_WIN32) || defined(_WIN64) - ior_flags, + , + ior_direct, env->me_overlapped_fd #endif /* Windows */ - env->me_fd4data); + ); + if (rc == MDBX_SUCCESS) + adjust_defaults(env); } #if MDBX_DEBUG @@ -18096,7 +19291,8 @@ __cold static int env_close(MDBX_env *env) { } munlock_all(env); - osal_ioring_destroy(&env->me_ioring); + if (!(env->me_flags & MDBX_RDONLY)) + osal_ioring_destroy(&env->me_ioring); lcklist_lock(); const int rc = lcklist_detach_locked(env); @@ -18115,10 +19311,11 @@ __cold static int env_close(MDBX_env *env) { } #if defined(_WIN32) || defined(_WIN64) - if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) { + eASSERT(env, !env->me_overlapped_fd || + env->me_overlapped_fd == INVALID_HANDLE_VALUE); + if (env->me_data_lock_event != INVALID_HANDLE_VALUE) { CloseHandle(env->me_data_lock_event); - CloseHandle(env->me_overlapped_fd); - env->me_overlapped_fd = INVALID_HANDLE_VALUE; + env->me_data_lock_event = INVALID_HANDLE_VALUE; } #endif /* Windows */ @@ -18138,9 +19335,11 @@ __cold static int env_close(MDBX_env *env) { } if (env->me_dbxs) { - for (size_t i = env->me_numdbs; --i >= CORE_DBS;) - osal_free(env->me_dbxs[i].md_name.iov_base); + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) + if (env->me_dbxs[i].md_name.iov_len) + osal_free(env->me_dbxs[i].md_name.iov_base); osal_free(env->me_dbxs); + env->me_numdbs = CORE_DBS; env->me_dbxs = nullptr; } if (env->me_pbuf) { @@ -18159,11 +19358,17 @@ __cold static int env_close(MDBX_env *env) { osal_free(env->me_pathname); env->me_pathname = nullptr; } +#if defined(_WIN32) || defined(_WIN64) + if (env->me_pathname_char) { + osal_free(env->me_pathname_char); + env->me_pathname_char = nullptr; + } +#endif /* Windows */ if (env->me_txn0) { dpl_free(env->me_txn0); txl_free(env->me_txn0->tw.lifo_reclaimed); pnl_free(env->me_txn0->tw.retired_pages); - pnl_free(env->me_txn0->tw.spill_pages); + 
pnl_free(env->me_txn0->tw.spilled.list); pnl_free(env->me_txn0->tw.relist); osal_free(env->me_txn0); env->me_txn0 = nullptr; @@ -18240,9 +19445,10 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { while ((dp = env->me_dp_reserve) != NULL) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; - osal_free(dp); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); ENSURE(env, env->me_lcklist_next == nullptr); @@ -18252,105 +19458,6 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { return rc; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_close(MDBX_env *env) { - return __inline_mdbx_env_close(env); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - -/* Compare two items pointing at aligned unsigned int's. */ -__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(4, a->iov_base), - unaligned_peek_u32(4, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(4, a->iov_base), - unaligned_peek_u64(4, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items pointing at 2-byte aligned unsigned int's. 
*/ -__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(2, a->iov_base), - unaligned_peek_u32(2, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(2, a->iov_base), - unaligned_peek_u64(2, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items pointing at unsigned values with unknown alignment. - * - * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ -__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(1, a->iov_base), - unaligned_peek_u32(1, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(1, a->iov_base), - unaligned_peek_u64(1, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items lexically */ -__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { - if (a->iov_len == b->iov_len) - return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; - - const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; - return likely(diff_data) ? diff_data : diff_len; -} - -/* Compare two items in reverse byte order */ -__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { - const size_t shortest = (a->iov_len < b->iov_len) ? 
a->iov_len : b->iov_len; - if (likely(shortest)) { - const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; - const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len; - const uint8_t *const end = pa - shortest; - do { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } while (pa != end); - } - return CMP2INT(a->iov_len, b->iov_len); -} - -/* Fast non-lexically comparator */ -__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { - int diff = CMP2INT(a->iov_len, b->iov_len); - return likely(diff) || a->iov_len == 0 - ? diff - : memcmp(a->iov_base, b->iov_base, a->iov_len); -} - -static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, - const MDBX_val *b) { - /* checking for the use of a known good comparator - * or/otherwise for a full byte-to-byte match */ - return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse || - cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0; -} - /* Search for key within a page, using binary search. * Returns the smallest entry larger or equal to the key. * Updates the cursor index with the index of the found entry. 
@@ -18385,8 +19492,8 @@ __hot static struct node_result node_search(MDBX_cursor *mc, do { i = (low + high) >> 1; nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); int cr = cmp(key, &nodekey); DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); if (cr > 0) @@ -18420,8 +19527,8 @@ __hot static struct node_result node_search(MDBX_cursor *mc, node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); - cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); @@ -18534,10 +19641,9 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, return MDBX_SUCCESS; } -__cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, - MDBX_page *page, - MDBX_cursor *const mc, - const txnid_t front) { +__cold static __noinline pgr_t +page_get_checker_full(const uint16_t ILL, MDBX_page *page, + const MDBX_cursor *const mc, const txnid_t front) { pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; if (likely(r.err == MDBX_SUCCESS)) r.err = page_check(mc, page); @@ -18547,7 +19653,7 @@ __cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, } __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, - MDBX_cursor *const mc, + const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { MDBX_txn *const txn = mc->mc_txn; @@ -18580,7 +19686,6 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, const size_t i = dpl_search(spiller, 
pgno); tASSERT(txn, (intptr_t)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { - spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; r.page = spiller->tw.dirtylist->items[i].ptr; break; } @@ -18645,7 +19750,7 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, } else { const struct node_result nsr = node_search(mc, key); if (likely(nsr.node)) - i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; + i = mc->mc_ki[mc->mc_top] + (intptr_t)nsr.exact - 1; else i = page_numkeys(mp) - 1; DEBUG("following index %zu for key [%s]", i, DKEY_DEBUG(key)); @@ -18664,7 +19769,8 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, ready: if (flags & MDBX_PS_MODIFY) { - if (unlikely((rc = page_touch(mc)) != 0)) + rc = page_touch(mc); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mp = mc->mc_pg[mc->mc_top]; } @@ -18849,7 +19955,8 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { } cASSERT(mc, root >= NUM_METAS); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED) || + mc->mc_pg[0]->mp_pgno != root) { txnid_t pp_txnid = mc->mc_db->md_mod_txnid; pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid ? 
pp_txnid @@ -18877,8 +19984,6 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { mc->mc_pg[0]->mp_flags); if (flags & MDBX_PS_MODIFY) { - if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc))) - return rc; if (unlikely(rc = page_touch(mc))) return rc; } @@ -18906,18 +20011,11 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, if (!MDBX_DISABLE_VALIDATION) { const MDBX_env *env = mc->mc_txn->mt_env; const size_t dsize = data->iov_len; - if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax)) - poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); const unsigned npages = number_of_ovpages(env, dsize); - if (unlikely(lp.page->mp_pages != npages)) { - if (lp.page->mp_pages < npages) - return bad_page(lp.page, - "too less n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); - else - poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); - } + if (unlikely(lp.page->mp_pages < npages)) + return bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } return MDBX_SUCCESS; } @@ -18974,7 +20072,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); + return cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, @@ -19046,9 +20144,9 @@ static int cursor_sibling(MDBX_cursor *mc, int dir) { DEBUG("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - if ((dir == SIBLING_RIGHT) - ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { + if ((dir == SIBLING_RIGHT) ? 
(mc->mc_ki[mc->mc_top] + (size_t)1 >= + page_numkeys(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { DEBUG("no more keys aside, moving to next %s sibling", dir ? "right" : "left"); if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { @@ -19097,7 +20195,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { - if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) + if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } @@ -19412,7 +20510,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { return ret; } } else { - mc->mc_pg[0] = 0; + mc->mc_pg[0] = nullptr; } ret.err = page_search(mc, &aligned_key, 0); @@ -19420,6 +20518,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { return ret; mp = mc->mc_pg[mc->mc_top]; + MDBX_ANALYSIS_ASSUME(mp != nullptr); cASSERT(mc, IS_LEAF(mp)); search_node:; @@ -19477,10 +20576,12 @@ got_node: if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } else { + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -19588,6 +20689,7 @@ static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; @@ -19637,6 +20739,7 @@ static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != 
MDBX_SUCCESS)) return rc; + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; @@ -19650,20 +20753,11 @@ static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; } -__hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; +static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + int (*mfunc)(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data); + int rc; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -19702,8 +20796,8 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(rc)) return rc; } else { - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); + rc = cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); if (unlikely(rc)) return rc; } @@ -19809,8 +20903,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; - } - { + } else { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_DUPDATA)) { get_key_optional(node, key); @@ -19880,6 +20973,22 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != 
MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return cursor_get(mc, key, data, op); +} + static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { int err = page_search(mc, NULL, MDBX_PS_FIRST); @@ -20010,7 +21119,6 @@ static int touch_dbi(MDBX_cursor *mc) { *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_UPDATE_GC) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); @@ -20024,153 +21132,74 @@ static int touch_dbi(MDBX_cursor *mc) { return MDBX_SUCCESS; } -/* Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * [in] mc The cursor to operate on. 
*/ -static int cursor_touch(MDBX_cursor *mc) { - int rc = MDBX_SUCCESS; - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - rc = touch_dbi(mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data) { + cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0); + cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); + cASSERT(mc, cursor_is_tracked(mc)); + + if ((mc->mc_flags & C_SUB) == 0) { + MDBX_txn *const txn = mc->mc_txn; + txn_lru_turn(txn); + + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + int err = touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + + /* Estimate how much space this operation will take: */ + /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ + size_t need = CURSOR_STACK + 3; + /* 2) GC/FreeDB for any payload */ + if (mc->mc_dbi > FREE_DBI) { + need += txn->mt_dbs[FREE_DBI].md_depth + (size_t)3; + /* 3) Named DBs also dirty the main DB */ + if (mc->mc_dbi > MAIN_DBI) + need += txn->mt_dbs[MAIN_DBI].md_depth + (size_t)3; + } +#if xMDBX_DEBUG_SPILLING != 2 + /* production mode */ + /* 4) Double the page chain estimation + * for extensively splitting, rebalance and merging */ + need += need; + /* 5) Factor the key+data which to be put in */ + need += bytes2pgno(txn->mt_env, node_size(key, data)) + (size_t)1; +#else + /* debug mode */ + (void)key; + (void)data; + txn->mt_env->debug_dirtied_est = ++need; + txn->mt_env->debug_dirtied_act = 0; +#endif /* xMDBX_DEBUG_SPILLING == 2 */ + + int err = txn_spill(txn, mc, need); + if (unlikely(err != MDBX_SUCCESS)) + return err; } + + int rc = MDBX_SUCCESS; if (likely(mc->mc_snum)) { mc->mc_top = 0; do { rc = page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); + if (unlikely(rc != MDBX_SUCCESS)) + break; + mc->mc_top += 1; + } while (mc->mc_top < mc->mc_snum); mc->mc_top = mc->mc_snum - 1; } return rc; } -__hot int 
mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - unsigned flags) { - MDBX_env *env; - MDBX_page *sub_root = NULL; +static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + MDBX_page *sub_root = nullptr; MDBX_val xdata, *rdata, dkey, olddata; MDBX_db nested_dupdb; int err; DKBUF_DEBUG; - - if (unlikely(mc == NULL || key == NULL || data == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; - - cASSERT(mc, cursor_is_tracked(mc)); - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any early failures. */ - size_t mcount = 0, dcount = 0; - if (unlikely(flags & MDBX_MULTIPLE)) { - if (unlikely(flags & MDBX_RESERVE)) - return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - dcount = data[1].iov_len; - if (unlikely(dcount < 2 || data->iov_len == 0)) - return MDBX_BAD_VALSIZE; - if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) - return MDBX_BAD_VALSIZE; - if (unlikely(dcount > MAX_MAPSIZE / 2 / - (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { - /* checking for multiplication overflow */ - if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) - return MDBX_TOO_LARGE; - } - data[1].iov_len = 0 /* reset done item counter */; - } - - if (flags & MDBX_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | - MDBX_INTEGERDUP | MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - data->iov_base = nullptr; - } - - const unsigned nospill = flags & MDBX_NOSPILL; - flags -= nospill; - - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return 
(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS - : MDBX_BAD_TXN; - - uint64_t aligned_keybytes, aligned_databytes; - MDBX_val aligned_key, aligned_data; - if (likely((mc->mc_flags & C_SUB) == 0)) { - if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || - key->iov_len > mc->mc_dbx->md_klen_max)) { - cASSERT(mc, !"Invalid key-size"); - return MDBX_BAD_VALSIZE; - } - if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || - data->iov_len > mc->mc_dbx->md_vlen_max)) { - cASSERT(mc, !"Invalid data-size"); - return MDBX_BAD_VALSIZE; - } - - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - switch (key->iov_len) { - default: - cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); - key = &aligned_key; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); - key = &aligned_key; - } - break; - } - } - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { - switch (data->iov_len) { - default: - cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 4); - data = &aligned_data; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - 
aligned_data.iov_len = 8); - data = &aligned_data; - } - break; - } - } - } - + MDBX_env *const env = mc->mc_txn->mt_env; DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, DDBI(mc), DKEY_DEBUG(key), key->iov_len, DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); @@ -20182,12 +21211,12 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. - * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц + * Здесь проще вызвать cursor_get(), так как для обслуживания таблиц * с MDBX_DUPSORT также требуется текущий размер данных. */ MDBX_val current_key, current_data; - rc = mdbx_cursor_get(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_get(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); + if (unlikely(err != MDBX_SUCCESS)) + return err; if (mc->mc_dbx->md_cmp(key, ¤t_key) != 0) return MDBX_EKEYMISMATCH; @@ -20205,16 +21234,16 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (mc->mc_xcursor->mx_db.md_entries > 1 || current_data.iov_len != data->iov_len) { drop_current: - rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, flags & MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) { - rc = mdbx_cursor_del(mc, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } @@ -20225,6 +21254,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, skip_check_samedata:; } + int rc 
= MDBX_SUCCESS; if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; @@ -20236,15 +21266,15 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { rc = cursor_last(mc, &dkey, &olddata); if (likely(rc == MDBX_SUCCESS)) { - rc = mc->mc_dbx->md_cmp(key, &dkey); - if (likely(rc > 0)) { + const int cmp = mc->mc_dbx->md_cmp(key, &dkey); + if (likely(cmp > 0)) { mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; + } else if (unlikely(cmp != 0)) { + /* new-key < last-key */ + return MDBX_EKEYMISMATCH; } else { - if (unlikely(rc != MDBX_SUCCESS)) - /* new-key < last-key - * or new-key == last-key without MDBX_APPENDDUP */ - return MDBX_EKEYMISMATCH; + rc = MDBX_SUCCESS; exact = true; } } @@ -20274,57 +21304,50 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = mdbx_cursor_del(mc, MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_ALLDUPS; - rc = MDBX_NOTFOUND; + rc = mc->mc_snum ? MDBX_NOTFOUND : MDBX_NO_ROOT; exact = false; - } else /* checking for early exit without dirtying pages */ - if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) && - unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) { - if (!mc->mc_xcursor) - /* the same data, nothing to update */ - return MDBX_SUCCESS; - if (flags & MDBX_NODUPDATA) - return MDBX_KEYEXIST; - if (flags & MDBX_APPENDDUP) - return MDBX_EKEYMISMATCH; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) - /* data is match exactly byte-to-byte, nothing to update */ - return MDBX_SUCCESS; - else { - /* The data has differences, but the user-provided comparator - * considers them equal. 
So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. */ + } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { + /* checking for early exit without dirtying pages */ + if (unlikely(eq_fast(data, &olddata))) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + if (mc->mc_xcursor) { + if (flags & MDBX_NODUPDATA) + return MDBX_KEYEXIST; + if (flags & MDBX_APPENDDUP) + return MDBX_EKEYMISMATCH; } + /* the same data, nothing to update */ + return MDBX_SUCCESS; } + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) != 0); + } } } else if (unlikely(rc != MDBX_NOTFOUND)) return rc; } mc->mc_flags &= ~C_DEL; + rdata = data; + size_t mcount = 0, dcount = 0; + if (unlikely(flags & MDBX_MULTIPLE)) { + dcount = data[1].iov_len; + data[1].iov_len = 0 /* reset done item counter */; + rdata = &xdata; + xdata.iov_len = data->iov_len * dcount; + } /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - rdata = data; - if (unlikely(flags & MDBX_MULTIPLE)) { - rdata = &xdata; - xdata.iov_len = data->iov_len * dcount; - } - if (unlikely(err = cursor_spill(mc, key, rdata))) - return err; - } + err = cursor_touch(mc, key, rdata); + if (unlikely(err)) + return err; if (unlikely(rc == MDBX_NO_ROOT)) { /* new database, write a root leaf page */ DEBUG("%s", "allocating new root leaf page"); - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - err = touch_dbi(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; @@ -20351,11 +21374,6 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) npr.page->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - err = cursor_touch(mc); - if (unlikely(err)) - return err; } bool insert_key, insert_data, do_sub = 
false; @@ -20379,11 +21397,11 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; size_t ksize = mc->mc_db->md_xsize; if (unlikely(key->iov_len != ksize)) return MDBX_BAD_VALSIZE; - ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + void *ptr = + page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to @@ -20423,9 +21441,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Large/Overflow page overwrites need special handling */ if (unlikely(node_flags(node) & F_BIGDATA)) { - int dpages = (node_size(key, data) > env->me_leaf_nodemax) - ? number_of_ovpages(env, data->iov_len) - : 0; + const size_t dpages = (node_size(key, data) > env->me_leaf_nodemax) + ? number_of_ovpages(env, data->iov_len) + : 0; const pgno_t pgno = node_largedata_pgno(node); pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); @@ -20434,13 +21452,13 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? */ - int ovpages = lp.page->mp_pages; - if (!IS_FROZEN(mc->mc_txn, lp.page) && - (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_FROZEN_RE) - ? (ovpages >= dpages) - : (ovpages == - /* LY: add configurable threshold to keep reserve space */ - dpages))) { + const size_t ovpages = lp.page->mp_pages; + const size_t extra_threshold = + (mc->mc_dbi == FREE_DBI) + ? 1 + : /* LY: add configurable threshold to keep reserve space */ 0; + if (!IS_FROZEN(mc->mc_txn, lp.page) && ovpages >= dpages && + ovpages <= dpages + extra_threshold) { /* yes, overwrite it. 
*/ if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { if (IS_SPILLED(mc->mc_txn, lp.page)) { @@ -20495,8 +21513,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { olddata.iov_len = node_ds(node); olddata.iov_base = node_data(node); - cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <= - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(olddata.iov_base, olddata.iov_len) <= + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); /* DB has dups? */ if (mc->mc_db->md_flags & MDBX_DUPSORT) { @@ -20511,28 +21529,21 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Was a single item before, must convert now */ if (!(node_flags(node) & F_DUPDATA)) { - /* does data match? */ - const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); - if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0)) - return MDBX_EKEYMISMATCH; - if (cmp == 0) { + if (flags & MDBX_APPENDDUP) { + const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); + cASSERT(mc, cmp != 0 || eq_fast(data, &olddata)); + if (unlikely(cmp <= 0)) + return MDBX_EKEYMISMATCH; + } else if (eq_fast(data, &olddata)) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) { - /* data is match exactly byte-to-byte, nothing to update */ - if (unlikely(flags & MDBX_MULTIPLE)) { - rc = MDBX_SUCCESS; - goto continue_multiple; - } - return MDBX_SUCCESS; - } else { - /* The data has differences, but the user-provided comparator - * considers them equal. So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. 
*/ - } - cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); - goto current; + /* data is match exactly byte-to-byte, nothing to update */ + rc = MDBX_SUCCESS; + if (likely((flags & MDBX_MULTIPLE) == 0)) + return rc; + goto continue_multiple; } /* Just overwrite the current item */ @@ -20636,10 +21647,10 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy(page_data(mp), page_data(fp), page_numkeys(fp) * fp->mp_leaf2_ksize); } else { - memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, - (char *)fp + fp->mp_upper + PAGEHDRSZ, + memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), + ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), olddata.iov_len - fp->mp_upper - PAGEHDRSZ); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), + memcpy(mp->mp_ptrs, fp->mp_ptrs, page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); for (i = 0; i < page_numkeys(fp); i++) { cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); @@ -20678,8 +21689,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, cASSERT(mc, key->iov_len < UINT16_MAX); node_set_ks(node, key->iov_len); memcpy(node_key(node), key->iov_base, key->iov_len); - cASSERT(mc, (char *)node_key(node) + node_ds(node) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(node_key(node), node_ds(node)) < + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); goto fix_parent; } @@ -20748,9 +21759,8 @@ new_sub:; STATIC_ASSERT( (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == MDBX_NOOVERWRITE); - xflags = MDBX_CURRENT | MDBX_NOSPILL | - ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + xflags = MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); @@ -20761,7 +21771,8 @@ new_sub:; mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; /* converted, write the original data first */ if 
(dupdata_flag) { - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, + xflags); if (unlikely(rc)) goto bad_sub; /* we've done our job */ @@ -20797,7 +21808,8 @@ new_sub:; STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == MDBX_APPEND); xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &xdata, + xflags); if (flags & F_SUBDATA) { void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; @@ -20823,7 +21835,7 @@ new_sub:; /* let caller know how many succeeded, if any */ data[1].iov_len = mcount; if (mcount < dcount) { - data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; + data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); insert_key = insert_data = false; goto more; } @@ -20843,7 +21855,127 @@ new_sub:; return rc; } -__hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { +static __hot int cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + cASSERT(mc, (mc->mc_flags & C_SUB) == 0); + uint64_t aligned_keybytes, aligned_databytes; + MDBX_val aligned_key, aligned_data; + if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || + key->iov_len > mc->mc_dbx->md_klen_max)) { + cASSERT(mc, !"Invalid key-size"); + return MDBX_BAD_VALSIZE; + } + if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || + data->iov_len > mc->mc_dbx->md_vlen_max)) { + cASSERT(mc, !"Invalid data-size"); + return MDBX_BAD_VALSIZE; + } + + if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { + switch (key->iov_len) { + default: + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break 
compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); + key = &aligned_key; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); + key = &aligned_key; + } + break; + } + } + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { + switch (data->iov_len) { + default: + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 4); + data = &aligned_data; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 8); + data = &aligned_data; + } + break; + } + } + return cursor_put_nochecklen(mc, key, data, flags); +} + +int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + MDBX_put_flags_t flags) { + if (unlikely(mc == NULL || key == NULL || data == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + + cASSERT(mc, cursor_is_tracked(mc)); + + /* Check this first so counter will always be zero on any early failures. 
*/ + if (unlikely(flags & MDBX_MULTIPLE)) { + if (unlikely(flags & MDBX_RESERVE)) + return MDBX_EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + const size_t dcount = data[1].iov_len; + if (unlikely(dcount < 2 || data->iov_len == 0)) + return MDBX_BAD_VALSIZE; + if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) + return MDBX_BAD_VALSIZE; + if (unlikely(dcount > MAX_MAPSIZE / 2 / + (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { + /* checking for multiplication overflow */ + if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) + return MDBX_TOO_LARGE; + } + } + + if (flags & MDBX_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_INTEGERDUP | MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + data->iov_base = nullptr; + } + + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS + : MDBX_BAD_TXN; + + return cursor_put_checklen(mc, key, data, flags); +} + +int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -20864,11 +21996,14 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) return MDBX_NOTFOUND; - if (likely((flags & MDBX_NOSPILL) == 0) && - unlikely(rc = cursor_spill(mc, NULL, NULL))) - return rc; + return cursor_del(mc, flags); +} - rc = cursor_touch(mc); +static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { + cASSERT(mc, mc->mc_flags & C_INITIALIZED); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])); + + int rc = cursor_touch(mc, nullptr, nullptr); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20884,30 +22019,29 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { if 
(flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* cursor_del() will subtract the final entry */ + /* will subtract the final entry later */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); + rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { if (node_flags(node) & F_SUBDATA) { /* update subDB info */ - void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); + memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - MDBX_cursor *m2; /* shrink fake page */ node_shrink(mp, mc->mc_ki[mc->mc_top]); node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -20954,7 +22088,109 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } del_key: - return cursor_del(mc); + mc->mc_db->md_entries--; + const MDBX_dbi dbi = mc->mc_dbi; + indx_t ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mp)); + node_del(mc, mc->mc_db->md_xsize); + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + + rc = rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + if (unlikely(!mc->mc_snum)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by rebalance and aren't needed here. */ + cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + size_t nkeys = page_numkeys(mp); + cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + node = page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. */ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + m3->mc_flags |= C_DEL; + } + } + } + + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); + return rc; fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -20971,7 +22207,6 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; - ret.page->mp_txnid = mc->mc_txn->mt_front; cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT @@ -20993,25 +22228,24 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned 
flags) { return ret; } -static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { pgr_t ret = likely(npages == 1) ? page_alloc(mc) - : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_DEFAULT); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, + DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; - ret.page->mp_txnid = mc->mc_txn->mt_front; cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - mc->mc_db->md_overflow_pages += npages; - ret.page->mp_pages = npages; + mc->mc_db->md_overflow_pages += (pgno_t)npages; + ret.page->mp_pages = (pgno_t)npages; cASSERT(mc, !(mc->mc_flags & C_SUB)); return ret; } @@ -21020,6 +22254,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_ANALYSIS_ASSUME(key != nullptr); DKBUF_DEBUG; DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " " key size %" PRIuPTR " [%s]", @@ -21042,12 +22277,12 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, mp->mp_lower = (indx_t)lower; mp->mp_upper = (indx_t)upper; - char *const ptr = page_leaf2key(mp, indx, ksize); + void *const ptr = page_leaf2key(mp, indx, ksize); cASSERT(mc, nkeys >= indx); const size_t diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. 
*/ - memmove(ptr + ksize, ptr, diff * ksize); + memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); return MDBX_SUCCESS; @@ -21100,6 +22335,8 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, const MDBX_val *key, MDBX_val *data, unsigned flags) { + MDBX_ANALYSIS_ASSUME(key != nullptr); + MDBX_ANALYSIS_ASSUME(data != nullptr); MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR @@ -21108,7 +22345,6 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, key ? key->iov_len : 0, DKEY_DEBUG(key)); cASSERT(mc, key != NULL && data != NULL); cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); - cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); MDBX_page *largepage = NULL; size_t node_bytes; @@ -21117,6 +22353,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, page_room(mp) >= node_bytes); } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { /* Put data on large/overflow page. 
*/ @@ -21130,6 +22367,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, flags); return MDBX_PROBLEM; } + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); const pgr_t npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) @@ -21141,10 +22379,12 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, flags |= F_BIGDATA; node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } else { + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); node_bytes = node_size(key, data) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } - cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. */ const size_t nkeys = page_numkeys(mp); @@ -21204,9 +22444,9 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { if (IS_LEAF2(mp)) { cASSERT(mc, ksize >= sizeof(indx_t)); size_t diff = nkeys - 1 - hole; - char *base = page_leaf2key(mp, hole, ksize); + void *const base = page_leaf2key(mp, hole, ksize); if (diff) - memmove(base, base + ksize, diff * ksize); + memmove(base, ptr_disp(base, ksize), diff * ksize); cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); @@ -21230,8 +22470,8 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { ? 
mp->mp_ptrs[r] + (indx_t)hole_size : mp->mp_ptrs[r]; - char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + hole_size, base, hole_offset - mp->mp_upper); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, hole_size), base, hole_offset - mp->mp_upper); cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); @@ -21253,7 +22493,6 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { static void node_shrink(MDBX_page *mp, size_t indx) { MDBX_node *node; MDBX_page *sp, *xp; - char *base; size_t nsize, delta, len, ptr; intptr_t i; @@ -21271,7 +22510,7 @@ static void node_shrink(MDBX_page *mp, size_t indx) { assert(nsize % 1 == 0); len = nsize; } else { - xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ + xp = ptr_disp(sp, delta); /* destination subpage */ for (i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); @@ -21284,8 +22523,8 @@ static void node_shrink(MDBX_page *mp, size_t indx) { node_set_ds(node, nsize); /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + delta, base, (char *)sp + len - base); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); ptr = mp->mp_ptrs[indx]; for (i = page_numkeys(mp); --i >= 0;) { @@ -21779,7 +23018,6 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp; MDBX_node *node; - char *base; size_t len; ptrdiff_t delta, ksize, oksize; intptr_t ptr, i, nkeys, indx; @@ -21824,9 +23062,9 @@ static int update_key(MDBX_cursor *mc, const MDBX_val *key) { } } - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); + memmove(ptr_disp(base, -delta), base, len); cASSERT(mc, 
mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; @@ -22189,7 +23427,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) return rc; - key.iov_base = (char *)key.iov_base + key.iov_len; + key.iov_base = ptr_disp(key.iov_base, key.iov_len); } while (++i != src_nkeys); } else { MDBX_node *srcnode = page_node(psrc, 0); @@ -22299,8 +23537,6 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } } - /* If not operating on GC, allow this page to be reused - * in this txn. Otherwise just add to free list. */ rc = page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; @@ -22438,7 +23674,7 @@ static int rebalance(MDBX_cursor *mc) { const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (pagetype & P_BRANCH) + 1; + const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1; /* Pages emptier than this are candidates for merging. */ size_t room_threshold = likely(mc->mc_dbi != FREE_DBI) @@ -22453,6 +23689,7 @@ static int rebalance(MDBX_cursor *mc) { (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), room); + cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, tp)); if (unlikely(numkeys < minkeys)) { DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", @@ -22541,8 +23778,9 @@ static int rebalance(MDBX_cursor *mc) { IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); rc = page_retire(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (likely(rc == MDBX_SUCCESS)) + rc = page_touch(mc); + return rc; } else { DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", mp->mp_pgno, mp->mp_flags); @@ -22574,9 +23812,10 @@ static int rebalance(MDBX_cursor *mc) { return rc; cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { + if (mn.mc_ki[pre_top] + (size_t)1 < page_numkeys(mn.mc_pg[pre_top])) { rc = page_get( - &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), + &mn, + node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + (size_t)1)), &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -22592,8 +23831,10 @@ static int rebalance(MDBX_cursor *mc) { const size_t right_room = right ? page_room(right) : 0; const size_t left_nkeys = left ? page_numkeys(left) : 0; const size_t right_nkeys = right ? 
page_numkeys(right) : 0; + bool involve = false; retry: - if (left_room > room_threshold && left_room >= right_room) { + if (left_room > room_threshold && left_room >= right_room && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try merge with left */ cASSERT(mc, left_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = left; @@ -22611,7 +23852,8 @@ retry: return rc; } } - if (right_room > room_threshold) { + if (right_room > room_threshold && + (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try merge with right */ cASSERT(mc, right_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = right; @@ -22627,7 +23869,8 @@ retry: } if (left_nkeys > minkeys && - (right_nkeys <= left_nkeys || right_room >= left_room)) { + (right_nkeys <= left_nkeys || right_room >= left_room) && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try move from left */ mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); @@ -22640,7 +23883,7 @@ retry: return rc; } } - if (right_nkeys > minkeys) { + if (right_nkeys > minkeys && (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try move from right */ mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); @@ -22661,6 +23904,10 @@ retry: return MDBX_SUCCESS; } + if (likely(!involve)) { + involve = true; + goto retry; + } if (likely(room_threshold > 0)) { room_threshold = 0; goto retry; @@ -22673,14 +23920,15 @@ retry: return MDBX_PROBLEM; } -__cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { +__cold static int page_check(const MDBX_cursor *const mc, + const MDBX_page *const mp) { DKBUF; int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); MDBX_env *const env = mc->mc_txn->mt_env; - const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb; + const ptrdiff_t offset = ptr_dist(mp, env->me_map); unsigned flags_mask = P_ILL_BITS; unsigned flags_expected = 0; if (offset < 0 || @@ 
-22752,7 +24000,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->mp_lower, mp->mp_upper, page_space(env)); - char *const end_of_page = (char *)mp + env->me_psize; + const char *const end_of_page = ptr_disp(mp, env->me_psize); const size_t nkeys = page_numkeys(mp); STATIC_ASSERT(P_BRANCH == 1); if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { @@ -22783,7 +24031,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { MDBX_val here, prev = {0, 0}; for (size_t i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { - char *const key = page_leaf2key(mp, i, leaf2_ksize); + const char *const key = page_leaf2key(mp, i, leaf2_ksize); if (unlikely(end_of_page < key + leaf2_ksize)) { rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", key + leaf2_ksize - end_of_page); @@ -22800,8 +24048,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; } if ((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_base = (void *)key; here.iov_len = leaf2_ksize; - here.iov_base = key; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); @@ -22809,7 +24057,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } } else { const MDBX_node *const node = page_node(mp, i); - const char *node_end = (char *)node + NODESIZE; + const char *const node_end = ptr_disp(node, NODESIZE); if (unlikely(node_end > end_of_page)) { rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, node_end - end_of_page); @@ -22818,7 +24066,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { const size_t ksize = node_ks(node); if (unlikely(ksize > ksize_max)) rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, 
ksize); - char *key = node_key(node); + const char *const key = node_key(node); if (unlikely(end_of_page < key + ksize)) { rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); @@ -22831,7 +24079,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { - here.iov_base = key; + here.iov_base = (void *)key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, @@ -22885,7 +24133,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if (unlikely(node_size_len(node_ks(node), dsize) <= - mc->mc_txn->mt_env->me_leaf_nodemax)) + mc->mc_txn->mt_env->me_leaf_nodemax) && + mc->mc_dbi != FREE_DBI) poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); if ((mc->mc_checking & CC_RETIRING) == 0) { @@ -22900,7 +24149,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(lp.page, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); - else + else if (mc->mc_dbi != FREE_DBI) poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); @@ -22970,8 +24219,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { for (int j = 0; j < nsubkeys; j++) { if (IS_LEAF2(sp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - size_t sub_ksize = sp->mp_leaf2_ksize; - char *sub_key = page_leaf2key(sp, j, sub_ksize); + const size_t sub_ksize = sp->mp_leaf2_ksize; + const char *const sub_key = page_leaf2key(sp, j, sub_ksize); if (unlikely(end_of_subpage < sub_key + 
sub_ksize)) { rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", sub_key + sub_ksize - end_of_subpage); @@ -22990,8 +24239,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; } if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; if (sub_prev.iov_base && unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, @@ -23001,7 +24250,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } } else { const MDBX_node *const sub_node = page_node(sp, j); - const char *sub_node_end = (char *)sub_node + NODESIZE; + const char *const sub_node_end = ptr_disp(sub_node, NODESIZE); if (unlikely(sub_node_end > end_of_subpage)) { rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", end_of_subpage - sub_node_end); @@ -23011,9 +24260,9 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(mp, "nested-node invalid flags (%u)\n", node_flags(sub_node)); - size_t sub_ksize = node_ks(sub_node); - char *sub_key = node_key(sub_node); - size_t sub_dsize = node_ds(sub_node); + const size_t sub_ksize = node_ks(sub_node); + const char *const sub_key = node_key(sub_node); + const size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || @@ -23024,8 +24273,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { sub_ksize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; if (sub_prev.iov_base && unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, @@ -23049,7 +24298,7 @@ __cold static int page_check(MDBX_cursor *const mc, const 
MDBX_page *const mp) { return rc; } -__cold static int cursor_check(MDBX_cursor *mc) { +__cold static int cursor_check(const MDBX_cursor *mc) { if (!mc->mc_txn->tw.dirtylist) { cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); @@ -23133,124 +24382,6 @@ __cold static int cursor_check_updating(MDBX_cursor *mc) { return rc; } -/* Complete a delete operation started by mdbx_cursor_del(). */ -static int cursor_del(MDBX_cursor *mc) { - int rc; - MDBX_page *mp; - indx_t ki; - size_t nkeys; - MDBX_dbi dbi = mc->mc_dbi; - - cASSERT(mc, cursor_is_tracked(mc)); - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node_del(mc, mc->mc_db->md_xsize); - mc->mc_db->md_entries--; - - /* Adjust other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - - rc = rebalance(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - if (unlikely(!mc->mc_snum)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by rebalance and aren't needed here. 
*/ - cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_EOF; - return MDBX_SUCCESS; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); - - /* Adjust this and other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = cursor_sibling(m3, SIBLING_RIGHT); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - if (m3->mc_ki[mc->mc_top] >= ki || - /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { - if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not inited it must be reinited. - * Else if node points to a subDB, nothing is needed. 
*/ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - } - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - m3->mc_flags |= C_DEL; - } - } - } - - cASSERT(mc, rc == MDBX_SUCCESS); - if (AUDIT_ENABLED()) - rc = cursor_check(mc); - return rc; - -bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return rc; -} - int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data) { int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); @@ -23269,8 +24400,8 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, return delete (txn, dbi, key, data, 0); } -static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { +static int delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { MDBX_cursor_couple cx; MDBX_cursor_op op; MDBX_val rdata; @@ -23303,7 +24434,7 @@ static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * cursor to be consistent until the end of the rebalance. 
*/ cx.outer.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_del(&cx.outer, flags); + rc = cursor_del(&cx.outer, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; } return rc; @@ -23326,7 +24457,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, int rc = MDBX_SUCCESS, foliage = 0; size_t i, ptop; MDBX_env *const env = mc->mc_txn->mt_env; - MDBX_val sepkey, rkey, xdata; + MDBX_val rkey, xdata; MDBX_page *tmp_ki_copy = NULL; DKBUF; @@ -23339,7 +24470,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, return rc; } STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (mp->mp_flags & P_BRANCH) + 1; + const size_t minkeys = (mp->mp_flags & P_BRANCH) + (size_t)1; DEBUG(">> splitting %s-page %" PRIaPGNO " and adding %zu+%zu [%s] at %i, nkeys %zi", @@ -23418,6 +24549,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); + MDBX_val sepkey = {nullptr, 0}; /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; @@ -23462,14 +24594,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { - char *split, *ins; - size_t lsize, rsize, ksize; /* Move half of the keys to the right sibling */ const intptr_t distance = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_xsize; - split = page_leaf2key(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); + size_t ksize = mc->mc_db->md_xsize; + void *const split = page_leaf2key(mp, split_indx, ksize); + size_t rsize = (nkeys - split_indx) * ksize; + size_t lsize = (nkeys - split_indx) * sizeof(indx_t); cASSERT(mc, mp->mp_lower >= lsize); mp->mp_lower -= (indx_t)lsize; cASSERT(mc, 
sister->mp_lower + lsize <= UINT16_MAX); @@ -23482,10 +24612,11 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base; if (distance < 0) { cASSERT(mc, ksize >= sizeof(indx_t)); - ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); + void *const ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); sepkey.iov_base = sister->mp_ptrs; - memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memmove(ptr_disp(ins, ksize), ins, + (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); @@ -23493,9 +24624,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { memcpy(sister->mp_ptrs, split, distance * ksize); - ins = page_leaf2key(sister, distance, ksize); + void *const ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); - memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); + memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), + rsize - distance * ksize); cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); @@ -23574,8 +24706,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); size = NODESIZE + node_ks(node) + sizeof(indx_t); if (IS_LEAF(mp)) size += (node_flags(node) & F_BIGDATA) ? 
sizeof(pgno_t) @@ -23611,8 +24742,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey = *newkey; if (split_indx != newindx) { MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + - PAGEHDRSZ); + ptr_disp(mp, tmp_ki_copy->mp_ptrs[split_indx] + PAGEHDRSZ); sepkey.iov_len = node_ks(node); sepkey.iov_base = node_key(node); } @@ -23644,7 +24774,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } /* root split? */ - ptop += mc->mc_snum - snum; + ptop += mc->mc_snum - (size_t)snum; /* Right page might now have changed parent. * Check if left page also changed parent. */ @@ -23697,7 +24827,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (unlikely(rc != MDBX_SUCCESS)) goto done; - MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); + MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + (size_t)1); cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); } else { mn.mc_top--; @@ -23750,7 +24880,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, break; } } - } else if (!IS_LEAF2(mp)) { + } else if (tmp_ki_copy /* !IS_LEAF2(mp) */) { /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; @@ -23769,8 +24899,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* Update index for the new key. 
*/ mc->mc_ki[mc->mc_top] = (indx_t)n; } else { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = node_key(node); rkey.iov_len = node_ks(node); if (IS_LEAF(mp)) { @@ -23923,7 +25052,7 @@ done: } int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, - unsigned flags) { + MDBX_put_flags_t flags) { int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23951,7 +25080,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /* LY: support for update (explicit overwrite) */ if (flags & MDBX_CURRENT) { - rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET); + rc = cursor_set(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET).err; if (likely(rc == MDBX_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) { @@ -23967,7 +25096,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, } if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_cursor_put(&cx.outer, key, data, flags); + rc = cursor_put_checklen(&cx.outer, key, data, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; return rc; @@ -24065,7 +25194,7 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, assert(pgno == 0 || bytes > PAGEHDRSZ); while (bytes > 0) { const size_t side = ctx->mc_head & 1; - const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; + const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; if (left < (pgno ? 
PAGEHDRSZ : 1)) { int err = compacting_toggle_write_buffers(ctx); if (unlikely(err != MDBX_SUCCESS)) @@ -24088,7 +25217,7 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, } pgno = 0; } - src = (const char *)src + chunk; + src = ptr_disp(src, chunk); } else memset(dst, 0, chunk); bytes -= chunk; @@ -24121,8 +25250,7 @@ static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, if (unlikely(err != MDBX_SUCCESS)) return err; return compacting_put_bytes( - ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0, - 0); + ctx, ptr_disp(mp, ctx->mc_env->me_psize - tail_bytes), tail_bytes, 0, 0); } __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, @@ -24138,18 +25266,18 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, return rc; /* Make cursor pages writable */ - char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); + void *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; - char *ptr = buf; + void *ptr = buf; for (size_t i = 0; i < mc->mc_top; i++) { - page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); - mc->mc_pg[i] = (MDBX_page *)ptr; - ptr += ctx->mc_env->me_psize; + page_copy(ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + mc->mc_pg[i] = ptr; + ptr = ptr_disp(ptr, ctx->mc_env->me_psize); } /* This is writable space for a leaf page. Usually not needed. */ - MDBX_page *const leaf = (MDBX_page *)ptr; + MDBX_page *const leaf = ptr; while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; @@ -24304,7 +25432,7 @@ __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; - const pgno_t aligner = pv2pages( + const size_t aligner = pv2pages( meta->mm_geo.grow_pv ? 
meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv); if (aligner) { const pgno_t aligned = pgno_align2os_pgno( @@ -24339,7 +25467,8 @@ __cold static void meta_make_sizeable(MDBX_meta *meta) { /* Copy environment with compaction. */ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { + const bool dest_is_pipe, + const MDBX_copy_flags_t flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); @@ -24375,7 +25504,7 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, read_txn->mt_dbs[FREE_DBI].md_leaf_pages + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; MDBX_val key, data; - while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == + while ((rc = cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == MDBX_SUCCESS) { const MDBX_PNL pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || @@ -24486,7 +25615,8 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, /* Copy environment as-is. 
*/ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { + const bool dest_is_pipe, + const MDBX_copy_flags_t flags) { /* We must start the actual read txn after blocking writers */ int rc = txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) @@ -24510,8 +25640,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - (MDBX_meta *)(buffer + - ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map)); + ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) @@ -24571,7 +25700,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, break; rc = errno; if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), - maybe useful for others fs */ + maybe useful for others FS */ EINVAL) not_the_same_filesystem = true; else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) @@ -24586,7 +25715,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, ? 
(size_t)MDBX_ENVCOPY_WRITEBUF : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ - memcpy(data_buffer, env->me_map + offset, chunk); + memcpy(data_buffer, ptr_disp(env->me_map, offset), chunk); rc = osal_write(fd, data_buffer, chunk); offset += chunk; } @@ -24613,7 +25742,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, } __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, - unsigned flags) { + MDBX_copy_flags_t flags) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -24682,13 +25811,17 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *dest_pathW = nullptr; - OSAL_MB2WIDE(dest_path, dest_pathW); - return mdbx_env_copyW(env, dest_pathW, flags); + wchar_t *dest_pathW = nullptr; + int rc = osal_mb2w(dest_path, &dest_pathW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_copyW(env, dest_pathW, flags); + osal_free(dest_pathW); + } + return rc; } -LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, - MDBX_copy_flags_t flags) { +__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, + MDBX_copy_flags_t flags) { #endif /* Windows */ int rc = check_env(env, true); @@ -24823,19 +25956,7 @@ __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } -#if !(defined(_WIN32) || defined(_WIN64)) -__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!arg)) - return MDBX_EINVAL; - - *arg = env->me_pathname; - return MDBX_SUCCESS; -} -#else +#if defined(_WIN32) || defined(_WIN64) __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -24849,6 +25970,51 @@ __cold int 
mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { } #endif /* Windows */ +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + +#if defined(_WIN32) || defined(_WIN64) + if (!env->me_pathname_char) { + *arg = nullptr; + DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; + size_t mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + if (rc == ERROR_INVALID_FLAGS) { + mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + char *const mb_pathname = osal_malloc(mb_len); + if (!mb_pathname) + return MDBX_ENOMEM; + if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, + env->me_pathname, -1, mb_pathname, + (int)mb_len, nullptr, nullptr)) { + rc = (int)GetLastError(); + osal_free(mb_pathname); + return rc; + } + if (env->me_pathname_char || + InterlockedCompareExchangePointer( + (PVOID volatile *)&env->me_pathname_char, mb_pathname, nullptr)) + osal_free(mb_pathname); + } + *arg = env->me_pathname_char; +#else + *arg = env->me_pathname; +#endif /* Windows */ + return MDBX_SUCCESS; +} + __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -24861,12 +26027,6 @@ __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { return MDBX_SUCCESS; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) { - return __inline_mdbx_env_stat(env, stat, bytes); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) { st->ms_depth = 
db->md_depth; st->ms_branch_pages = db->md_branch_pages; @@ -25038,13 +26198,6 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, - size_t bytes) { - return __inline_mdbx_env_info(env, info, bytes); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *arg, const size_t bytes) { @@ -25052,7 +26205,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); /* is the environment open? - * (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171) */ + * (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ if (unlikely(!env->me_map)) { /* environment not yet opened */ #if 1 @@ -25175,6 +26328,10 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.prefault = + atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); + arg->mi_pgop_stat.mincore = + atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); arg->mi_pgop_stat.msync = atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); arg->mi_pgop_stat.fsync = @@ -25248,13 +26405,13 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, } } -static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags) { +static __inline MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags) { return (flags & MDBX_REVERSEKEY) ? cmp_reverse : (flags & MDBX_INTEGERKEY) ? 
cmp_int_align2 : cmp_lexical; } -static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { +static __inline MDBX_cmp_func *get_default_datacmp(MDBX_db_flags_t flags) { return !(flags & MDBX_DUPSORT) ? cmp_lenfast : ((flags & MDBX_INTEGERDUP) @@ -25264,7 +26421,7 @@ static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - /* LY: so, accepting only three cases for the table's flags: + /* Accepting only three cases: * 1) user_flags and both comparators are zero * = assume that a by-default mode/flags is requested for reading; * 2) user_flags exactly the same @@ -25286,6 +26443,10 @@ static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, /* make sure flags changes get committed */ txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS; txn->mt_flags |= MDBX_TXN_DIRTY; + /* обнуляем компараторы для установки в соответствии с флагами, + * либо заданных пользователем */ + txn->mt_dbxs[dbi].md_cmp = nullptr; + txn->mt_dbxs[dbi].md_dcmp = nullptr; } else { return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; } @@ -25312,22 +26473,34 @@ static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, return MDBX_SUCCESS; } -static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, - MDBX_dbi *dbi, MDBX_cmp_func *keycmp, +static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, + unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { int rc = MDBX_EINVAL; if (unlikely(!dbi)) return rc; + void *clone = nullptr; + bool locked = false; if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { - early_bailout: + bailout: + tASSERT(txn, MDBX_IS_ERROR(rc)); *dbi = 0; + if (locked) + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); + osal_free(clone); return rc; } rc = check_txn(txn, 
MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; + + if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { + rc = MDBX_EACCESS; + goto bailout; + } switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_ACCEDE)) { @@ -25337,7 +26510,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, __fallthrough /* fall through */; default: rc = MDBX_EINVAL; - goto early_bailout; + goto bailout; case MDBX_DUPSORT: case MDBX_DUPSORT | MDBX_REVERSEDUP: @@ -25350,39 +26523,74 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* main table? */ - if (!table_name) { + if (table_name == MDBX_PGWALK_MAIN || + table_name->iov_base == MDBX_PGWALK_MAIN) { rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = MAIN_DBI; return rc; } + if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + *dbi = FREE_DBI; + return rc; + } + if (table_name == MDBX_PGWALK_META || + table_name->iov_base == MDBX_PGWALK_META) { + rc = MDBX_EINVAL; + goto bailout; + } - MDBX_env *env = txn->mt_env; - size_t len = strlen(table_name); - if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) + MDBX_val key = *table_name; + MDBX_env *const env = txn->mt_env; + if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) return MDBX_EINVAL; - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + /* Cannot mix named table(s) with DUPSORT flags */ + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { + if ((user_flags & MDBX_CREATE) == 0) { + rc = MDBX_NOTFOUND; + goto bailout; + } + if (txn->mt_dbs[MAIN_DBI].md_leaf_pages || txn->mt_dbxs[MAIN_DBI].md_cmp) { + /* В MAIN_DBI есть записи либо она уже использовалась. 
*/ + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + /* Пересоздаём MAIN_DBI если там пусто. */ + atomic_store32(&txn->mt_dbiseqs[MAIN_DBI], dbi_seq(env, MAIN_DBI), + mo_AcquireRelease); + tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && + txn->mt_dbs[MAIN_DBI].md_entries == 0 && + txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); + txn->mt_dbs[MAIN_DBI].md_flags &= MDBX_REVERSEKEY | MDBX_INTEGERKEY; + txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); txn->mt_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } + tASSERT(txn, txn->mt_dbxs[MAIN_DBI].md_cmp); + /* Is the DB already open? */ MDBX_dbi scan, slot; for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len == txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = scan; return rc; } @@ -25391,84 +26599,80 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Fail, if no free slot and max hit */ if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto early_bailout; - } - - /* Cannot mix named table with some main-table flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & - (MDBX_DUPSORT | MDBX_INTEGERKEY))) { - rc = (user_flags & MDBX_CREATE) ? 
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; - goto early_bailout; + goto bailout; } /* Find the DB info */ - MDBX_val key, data; - key.iov_len = len; - key.iov_base = (void *)table_name; + MDBX_val data; MDBX_cursor_couple couple; rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - goto early_bailout; + goto bailout; } else { /* make sure this is actually a table */ MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], couple.outer.mc_ki[couple.outer.mc_top]); if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { rc = MDBX_INCOMPATIBLE; - goto early_bailout; + goto bailout; } if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; - goto early_bailout; + goto bailout; } } if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { rc = MDBX_EACCESS; - goto early_bailout; + goto bailout; } /* Done here so we cannot fail after creating a new DB */ - char *namedup = osal_strdup(table_name); - if (unlikely(!namedup)) { - rc = MDBX_ENOMEM; - goto early_bailout; - } + if (key.iov_len) { + clone = osal_malloc(key.iov_len); + if (unlikely(!clone)) { + rc = MDBX_ENOMEM; + goto bailout; + } + key.iov_base = memcpy(clone, key.iov_base, key.iov_len); + } else + key.iov_base = ""; int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - osal_free(namedup); - goto early_bailout; + goto bailout; } + locked = true; /* Import handles from env */ dbi_import_locked(txn); /* Rescan after mutex acquisition & import handles */ for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len == 
txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; - *dbi = scan; - goto later_exit; + goto bailout; + slot = scan; + goto done; } } if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto later_bailout; + goto bailout; } unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; @@ -25482,12 +26686,11 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; data.iov_len = sizeof(db_dummy); data.iov_base = &db_dummy; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, &key, &data, - F_SUBDATA | MDBX_NOOVERWRITE)); - + WITH_CURSOR_TRACKING( + couple.outer, rc = cursor_put_checklen(&couple.outer, &key, &data, + F_SUBDATA | MDBX_NOOVERWRITE)); if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; + goto bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; @@ -25501,43 +26704,65 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { tASSERT(txn, (dbiflags & DBI_CREAT) == 0); - later_bailout: - *dbi = 0; - later_exit: - osal_free(namedup); - } else { - txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name.iov_base = namedup; - txn->mt_dbxs[slot].md_name.iov_len = len; - txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = - dbi_seq(env, slot); - if (!(dbiflags & DBI_CREAT)) - env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - if (txn->mt_numdbs == slot) { - txn->mt_cursors[slot] = NULL; - osal_compiler_barrier(); - txn->mt_numdbs = slot + 1; - } - if (env->me_numdbs <= slot) { - 
osal_memory_fence(mo_AcquireRelease, true); - env->me_numdbs = slot + 1; - } - *dbi = slot; + goto bailout; } + txn->mt_dbistate[slot] = (uint8_t)dbiflags; + txn->mt_dbxs[slot].md_name = key; + txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); + if (!(dbiflags & DBI_CREAT)) + env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + if (txn->mt_numdbs == slot) { + txn->mt_cursors[slot] = NULL; + osal_compiler_barrier(); + txn->mt_numdbs = slot + 1; + } + if (env->me_numdbs <= slot) { + osal_memory_fence(mo_AcquireRelease, true); + env->me_numdbs = slot + 1; + } + +done: + *dbi = slot; ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return rc; + return MDBX_SUCCESS; } -int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi) { - return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr); +static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_PGWALK_MAIN || name_cstr == MDBX_PGWALK_GC || + name_cstr == MDBX_PGWALK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } -int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); +int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open(txn, name, flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open_ex(MDBX_txn *txn, const char 
*name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp); +} + +int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, @@ -25575,7 +26800,7 @@ static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(dbi >= env->me_numdbs)) return MDBX_BAD_DBI; - char *ptr = env->me_dbxs[dbi].md_name.iov_base; + char *const ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) return MDBX_BAD_DBI; @@ -25602,6 +26827,12 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (unlikely(dbi < CORE_DBS)) + return (dbi == MAIN_DBI) ? MDBX_SUCCESS : MDBX_BAD_DBI; + + if (unlikely(dbi >= env->me_maxdbs)) + return MDBX_BAD_DBI; + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_BAD_DBI; @@ -25634,12 +26865,6 @@ int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, return MDBX_SUCCESS; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { - return __inline_mdbx_dbi_flags(txn, dbi, flags); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { int rc = page_search(mc, NULL, MDBX_PS_FIRST); if (likely(rc == MDBX_SUCCESS)) { @@ -25652,9 +26877,10 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) cursor_pop(mc); - rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages + - mc->mc_db->md_leaf_pages + - mc->mc_db->md_overflow_pages); + rc = pnl_need(&txn->tw.retired_pages, + (size_t)mc->mc_db->md_branch_pages + + 
(size_t)mc->mc_db->md_leaf_pages + + (size_t)mc->mc_db->md_overflow_pages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -26032,7 +27258,8 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, return rc; } -__cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { +__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + MDBX_debug_func *logger) { const int rc = runtime_flags | (loglevel << 16); if (level != MDBX_LOG_DONTCHANGE) @@ -26148,16 +27375,6 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, return oldest; } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { - return __inline_mdbx_env_set_syncbytes(env, threshold); -} - -__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { - return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { int rc = check_env(env, false); if (unlikely(rc != MDBX_SUCCESS)) @@ -26216,7 +27433,7 @@ typedef struct mdbx_walk_ctx { } mdbx_walk_ctx_t; __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep); + const MDBX_val *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) @@ -26237,7 +27454,8 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { /* Depth-first tree traversal. 
*/ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const char *name, int deep, txnid_t parent_txnid) { + const MDBX_val *name, int deep, + txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); @@ -26403,33 +27621,22 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } assert(type == MDBX_page_leaf); - MDBX_db db; switch (node_flags(node)) { default: continue; - case F_SUBDATA /* sub-db */: { - const size_t namelen = node_ks(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + case F_SUBDATA /* sub-db */: + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - break; + } else { + MDBX_db db; + memcpy(&db, node_data(node), sizeof(db)); + const MDBX_val subdb_name = {node_key(node), node_ks(node)}; + assert(err == MDBX_SUCCESS); + err = walk_sdb(ctx, &db, &subdb_name, deep + 1); } - - char namebuf_onstask[64]; - char *const sub_name = (namelen < sizeof(namebuf_onstask)) - ? 
namebuf_onstask - : osal_malloc(namelen + 1); - if (unlikely(!sub_name)) - return MDBX_ENOMEM; - memcpy(sub_name, node_key(node), namelen); - sub_name[namelen] = 0; - memcpy(&db, node_data(node), sizeof(db)); - assert(err == MDBX_SUCCESS); - err = walk_sdb(ctx, &db, sub_name, deep + 1); - if (sub_name != namebuf_onstask) - osal_free(sub_name); - } break; + break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_ds(node) != sizeof(MDBX_db) || @@ -26437,6 +27644,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { + MDBX_db db; memcpy(&db, node_data(node), sizeof(db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); @@ -26460,7 +27668,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep) { + const MDBX_val *name, int deep) { if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ @@ -26825,7 +28033,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, } next.outer.mc_signature = MDBX_MC_LIVE; - rc = mdbx_cursor_get(&next.outer, key, data, move_op); + rc = cursor_get(&next.outer, key, data, move_op); if (unlikely(rc != MDBX_SUCCESS && (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) return rc; @@ -27054,7 +28262,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, /* убираем лишний бит, он был признаком запрошенного режима */ flags -= MDBX_NOOVERWRITE; - rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH); + rc = cursor_set(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; if (rc != MDBX_SUCCESS) goto bailout; } else { @@ -27062,7 +28270,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) 
return MDBX_EINVAL; MDBX_val present_data; - rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY); + rc = cursor_set(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { old_data->iov_base = NULL; old_data->iov_len = 0; @@ -27112,9 +28320,9 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, } if (likely(new_data)) - rc = mdbx_cursor_put(&cx.outer, key, new_data, flags); + rc = cursor_put_checklen(&cx.outer, key, new_data, flags); else - rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); + rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); bailout: txn->mt_cursors[dbi] = cx.outer.mc_next; @@ -27168,7 +28376,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { return rc; const MDBX_env *env = txn->mt_env; - const ptrdiff_t offset = (uint8_t *)ptr - env->me_map; + const ptrdiff_t offset = ptr_dist(ptr, env->me_map); if (offset >= 0) { const pgno_t pgno = bytes2pgno(env, offset); if (likely(pgno < txn->mt_next_pgno)) { @@ -27240,16 +28448,6 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) { - return __inline_mdbx_limits_pgsize_min(); -} - -__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) { - return __inline_mdbx_limits_pgsize_max(); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { if (pagesize < 1) pagesize = (intptr_t)mdbx_default_pagesize(); @@ -27284,8 +28482,8 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); const uint64_t pgl_limit = - pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482); - const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482); + pagesize * (uint64_t)(MDBX_PGL_LIMIT / 
MDBX_GOLD_RATIO_DBL); + const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL); return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit; } @@ -27353,16 +28551,6 @@ uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { return float2key(ieee754_32bit); } -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API -MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) { - return __inline_mdbx_key_from_int64(i64); -} - -MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) { - return __inline_mdbx_key_from_int32(i32); -} -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ - #define IEEE754_DOUBLE_MANTISSA_SIZE 52 #define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF #define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF @@ -27434,8 +28622,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); @@ -27460,8 +28648,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); @@ -27522,11 +28710,11 @@ int64_t mdbx_int64_from_key(const MDBX_val v) { 
UINT64_C(0x8000000000000000)); } -__cold MDBX_cmp_func *mdbx_get_keycmp(unsigned flags) { +__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) { return get_default_keycmp(flags); } -__cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) { +__cold MDBX_cmp_func *mdbx_get_datacmp(MDBX_db_flags_t flags) { return get_default_datacmp(flags); } @@ -27541,14 +28729,14 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: - if (value == UINT64_MAX) - value = SIZE_MAX - 65536; + if (value == /* default */ UINT64_MAX) + value = MAX_WRITE; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > SIZE_MAX - 65536)) - return MDBX_TOO_LARGE; + return MDBX_EINVAL; value = bytes2pgno(env, (size_t)value + env->me_psize - 1); if ((uint32_t)value != atomic_load32(&env->me_lck->mti_autosync_threshold, mo_AcquireRelease) && @@ -27564,14 +28752,14 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_sync_period: - if (value == UINT64_MAX) - value = UINT32_MAX; + if (value == /* default */ UINT64_MAX) + value = 2780315 /* 42.42424 секунды */; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > UINT32_MAX)) - return MDBX_TOO_LARGE; + return MDBX_EINVAL; value = osal_16dot16_to_monotime((uint32_t)value); if (value != atomic_load64(&env->me_lck->mti_autosync_period, mo_AcquireRelease) && @@ -27586,8 +28774,8 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_max_db: - if (value == UINT64_MAX) - value = MDBX_MAX_DBI; + if (value == /* default */ UINT64_MAX) + value = 42; if (unlikely(value > MDBX_MAX_DBI)) return MDBX_EINVAL; if (unlikely(env->me_map)) @@ -27596,7 +28784,7 @@ __cold int 
mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_max_readers: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_READERS_LIMIT; if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) return MDBX_EINVAL; @@ -27606,7 +28794,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_dp_reserve_limit: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = INT_MAX; if (unlikely(value > INT_MAX)) return MDBX_EINVAL; @@ -27622,26 +28810,30 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, eASSERT(env, env->me_dp_reserve != NULL); MDBX_page *dp = env->me_dp_reserve; MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; - VALGRIND_MEMPOOL_FREE(env, dp); - osal_free(dp); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); env->me_dp_reserve_len -= 1; } } break; case MDBX_opt_rp_augment_limit: - if (value == UINT64_MAX) - value = MDBX_PGL_LIMIT; - if (unlikely(value > MDBX_PGL_LIMIT)) + if (value == /* default */ UINT64_MAX) { + env->me_options.flags.non_auto.rp_augment_limit = 0; + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + } else if (unlikely(value > MDBX_PGL_LIMIT)) return MDBX_EINVAL; - env->me_options.rp_augment_limit = (unsigned)value; + else { + env->me_options.flags.non_auto.rp_augment_limit = 1; + env->me_options.rp_augment_limit = (unsigned)value; + } break; case MDBX_opt_txn_dp_limit: case MDBX_opt_txn_dp_initial: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_PGL_LIMIT; if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4)) return MDBX_EINVAL; @@ -27676,40 +28868,73 @@ __cold int mdbx_env_set_option(MDBX_env *env, const 
MDBX_option_t option, break; case MDBX_opt_spill_max_denominator: - if (value == UINT64_MAX) - value = 255; + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_max_denominator = (uint8_t)value; break; case MDBX_opt_spill_min_denominator: + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_min_denominator = (uint8_t)value; break; case MDBX_opt_spill_parent4child_denominator: + if (value == /* default */ UINT64_MAX) + value = 0; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_parent4child_denominator = (uint8_t)value; break; case MDBX_opt_loose_limit: - if (value == UINT64_MAX) - value = 255; + if (value == /* default */ UINT64_MAX) + value = 64; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.dp_loose_limit = (uint8_t)value; break; case MDBX_opt_merge_threshold_16dot16_percent: - if (value == UINT64_MAX) - value = 32768; + if (value == /* default */ UINT64_MAX) + value = 65536 / 4 /* 25% */; if (unlikely(value < 8192 || value > 32768)) return MDBX_EINVAL; env->me_options.merge_threshold_16dot16_percent = (unsigned)value; recalculate_merge_threshold(env); break; + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + /* позволяем "установить" значение по-умолчанию и совпадающее + * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ + if (value == /* default */ UINT64_MAX && + value != ((env->me_flags & MDBX_NOMETASYNC) ? 
0 : UINT_MAX)) + err = MDBX_EINVAL; +#else + if (value == /* default */ UINT64_MAX) + value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; + if (value != (unsigned)value) + err = MDBX_EINVAL; + else + env->me_options.writethrough_threshold = (unsigned)value; +#endif + break; + + case MDBX_opt_prefault_write_enable: + if (value == /* default */ UINT64_MAX) { + env->me_options.prefault_write = default_prefault_write(env); + env->me_options.flags.non_auto.prefault_write = false; + } else if (value > 1) + err = MDBX_EINVAL; + else { + env->me_options.prefault_write = value != 0; + env->me_options.flags.non_auto.prefault_write = true; + } + break; + default: return MDBX_EINVAL; } @@ -27783,6 +29008,18 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, *pvalue = env->me_options.merge_threshold_16dot16_percent; break; + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + *pvalue = (env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX; +#else + *pvalue = env->me_options.writethrough_threshold; +#endif + break; + + case MDBX_opt_prefault_write_enable: + *pvalue = env->me_options.prefault_write; + break; + default: return MDBX_EINVAL; } @@ -27863,7 +29100,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { rss.rlim_cur = estimated_rss; if (rss.rlim_max < estimated_rss) - rss.rlim_max = used_range; + rss.rlim_max = estimated_rss; if (setrlimit(RLIMIT_RSS, &rss)) { rc = errno; WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", @@ -28122,6 +29359,109 @@ __cold void global_ctor(void) { #endif /* #if 0 */ } +/*------------------------------------------------------------------------------ + * Legacy API */ + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API + +LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, + MDBX_txn_flags_t flags, MDBX_txn **ret) { + return __inline_mdbx_txn_begin(env, parent, flags, ret); +} + +LIBMDBX_API int 
mdbx_txn_commit(MDBX_txn *txn) { + return __inline_mdbx_txn_commit(txn); +} + +LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, + size_t bytes) { + return __inline_mdbx_env_stat(env, stat, bytes); +} + +LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, + size_t bytes) { + return __inline_mdbx_env_info(env, info, bytes); +} + +LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { + return __inline_mdbx_dbi_flags(txn, dbi, flags); +} + +LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) { + return __inline_mdbx_env_sync(env); +} + +LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) { + return __inline_mdbx_env_sync_poll(env); +} + +LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) { + return __inline_mdbx_env_close(env); +} + +LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { + return __inline_mdbx_env_set_mapsize(env, size); +} + +LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { + return __inline_mdbx_env_set_maxdbs(env, dbs); +} + +LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { + return __inline_mdbx_env_get_maxdbs(env, dbs); +} + +LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env, + unsigned readers) { + return __inline_mdbx_env_set_maxreaders(env, readers); +} + +LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env, + unsigned *readers) { + return __inline_mdbx_env_get_maxreaders(env, readers); +} + +LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { + return __inline_mdbx_env_set_syncbytes(env, threshold); +} + +LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env, + size_t *threshold) { + return __inline_mdbx_env_get_syncbytes(env, threshold); +} + +LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env, + unsigned seconds_16dot16) { + return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); +} + +LIBMDBX_API 
__cold int mdbx_env_get_syncperiod(const MDBX_env *env, + unsigned *seconds_16dot16) { + return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16); +} + +LIBMDBX_API __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t +mdbx_limits_pgsize_min(void) { + return __inline_mdbx_limits_pgsize_min(); +} + +LIBMDBX_API __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t +mdbx_limits_pgsize_max(void) { + return __inline_mdbx_limits_pgsize_max(); +} + +LIBMDBX_API MDBX_NOTHROW_CONST_FUNCTION uint64_t +mdbx_key_from_int64(const int64_t i64) { + return __inline_mdbx_key_from_int64(i64); +} + +LIBMDBX_API MDBX_NOTHROW_CONST_FUNCTION uint32_t +mdbx_key_from_int32(const int32_t i32) { + return __inline_mdbx_key_from_int32(i32); +} + +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + /******************************************************************************/ __dll_export @@ -28267,6 +29607,7 @@ __dll_export " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) + " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) #if MDBX_DISABLE_VALIDATION @@ -28358,7 +29699,10 @@ __dll_export }; #ifdef __SANITIZE_ADDRESS__ -LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { +#if !defined(_MSC_VER) || __has_attribute(weak) +LIBMDBX_API __attribute__((__weak__)) +#endif +const char *__asan_default_options(void) { return "symbolize=1:allow_addr2line=1:" #if MDBX_DEBUG "debug=1:" @@ -28386,7 +29730,7 @@ LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -28432,6 +29776,8 @@ static int ntstatus2errcode(NTSTATUS status) { OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); ov.Internal = status; + /* Zap: '_Param_(1)' could be '0' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387); return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS : (int)GetLastError(); } @@ -28466,6 +29812,8 @@ extern NTSTATUS NTAPI NtMapViewOfSection( extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, IN OPTIONAL PVOID BaseAddress); +/* Zap: Inconsistent annotation for 'NtClose'... */ +MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251) extern NTSTATUS NTAPI NtClose(HANDLE Handle); extern NTSTATUS NTAPI NtAllocateVirtualMemory( @@ -28696,17 +30044,16 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap) { va_list ones; va_copy(ones, ap); - int needed = vsnprintf(nullptr, 0, fmt, ap); + const int needed = vsnprintf(nullptr, 0, fmt, ones); + va_end(ones); if (unlikely(needed < 0 || needed >= INT_MAX)) { *strp = nullptr; - va_end(ones); return needed; } - *strp = osal_malloc(needed + 1); + *strp = osal_malloc(needed + (size_t)1); if (unlikely(*strp == nullptr)) { - va_end(ones); #if defined(_WIN32) || defined(_WIN64) SetLastError(MDBX_ENOMEM); #else @@ -28715,9 +30062,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, return -1; } - int actual = vsnprintf(*strp, needed + 1, fmt, ones); - va_end(ones); - + const int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ap); assert(actual == needed); if (unlikely(actual < 0)) { osal_free(*strp); @@ -28731,7 +30076,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); - int rc = osal_vasprintf(strp, fmt, ap); + const int rc = osal_vasprintf(strp, fmt, ap); va_end(ap); return rc; } @@ -28930,19 +30275,32 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) -#ifndef WC_ERR_INVALID_CHARS -static const DWORD WC_ERR_INVALID_CHARS = - (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) - ? 0x00000080 - : 0; -#endif /* WC_ERR_INVALID_CHARS */ +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst) { + const size_t dst_wlen = MultiByteToWideChar( + CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0); + wchar_t *dst = *pdst; + int rc = ERROR_INVALID_NAME; + if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX)) + goto bailout; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, - size_t dst_n, - const char *src, - size_t src_n) { - return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, - (int)src_n, dst, (int)dst_n); + dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t)); + rc = MDBX_ENOMEM; + if (unlikely(!dst)) + goto bailout; + + *pdst = dst; + if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP, + MB_ERR_INVALID_CHARS, src, + -1, dst, (int)dst_wlen))) + return MDBX_SUCCESS; + + rc = ERROR_INVALID_NAME; +bailout: + if (*pdst) { + osal_free(*pdst); + *pdst = nullptr; + } + return rc; } #endif /* Windows */ @@ -28951,6 +30309,7 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, #if defined(_WIN32) || defined(_WIN64) #define ior_alignment_mask (ior->pagesize - 1) +#define ior_WriteFile_flag 1 #define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) static void ior_put_event(osal_ioring_t *ior, HANDLE event) { @@ -28989,16 +30348,18 @@ static size_t osal_iov_max; #undef OSAL_IOV_MAX #endif /* OSAL_IOV_MAX */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, +MDBX_INTERNAL_FUNC int 
osal_ioring_create(osal_ioring_t *ior #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd) { +) { memset(ior, 0, sizeof(osal_ioring_t)); - ior->fd = fd; #if defined(_WIN32) || defined(_WIN64) - ior->flags = flags; + ior->overlapped_fd = overlapped_fd; + ior->direct = enable_direct && overlapped_fd; const unsigned pagesize = (unsigned)osal_syspagesize(); ior->pagesize = pagesize; ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize); @@ -29011,7 +30372,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, assert(osal_iov_max > 0); #endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ - ior->boundary = (char *)(ior->pool + ior->allocated); + ior->boundary = ptr_disp(ior->pool, ior->allocated); return MDBX_SUCCESS; } @@ -29028,9 +30389,9 @@ static __inline size_t ior_offset(const ior_item_t *item) { static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { #if defined(ior_sgv_element) assert(sgvcnt > 0); - return (ior_item_t *)((char *)item + sizeof(ior_item_t) - - sizeof(ior_sgv_element) + - sizeof(ior_sgv_element) * sgvcnt); + return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) - + sizeof(ior_sgv_element) + + sizeof(ior_sgv_element) * sgvcnt); #else assert(sgvcnt == 1); (void)sgvcnt; @@ -29047,7 +30408,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, #if defined(_WIN32) || defined(_WIN64) const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); const bool use_gather = - (ior->flags & IOR_DIRECT) && ior->slots_left >= segments; + ior->direct && ior->overlapped_fd && ior->slots_left >= segments; #endif /* Windows */ ior_item_t *item = ior->pool; @@ -29060,31 +30421,32 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, ((bytes | (uintptr_t)data | ior->last_bytes | (uintptr_t)(uint64_t)item->sgv[0].Buffer) & ior_alignment_mask) == 0 && - ior->last_sgvcnt + segments < 
OSAL_IOV_MAX) { - assert((item->single.iov_len & 1) == 0); + ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) { + assert(ior->overlapped_fd); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); assert(item->sgv[ior->last_sgvcnt].Buffer == 0); ior->last_bytes += bytes; size_t i = 0; do { item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); - data = (char *)data + ior->pagesize; + data = ptr_disp(data, ior->pagesize); } while (++i < segments); ior->slots_left -= segments; item->sgv[ior->last_sgvcnt += segments].Buffer = 0; - assert((item->single.iov_len & 1) == 0); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); return MDBX_SUCCESS; } - const void *end = - (char *)(item->single.iov_base) + item->single.iov_len - 1; + const void *end = ptr_disp(item->single.iov_base, + item->single.iov_len - ior_WriteFile_flag); if (unlikely(end == data)) { - assert((item->single.iov_len & 1) != 0); + assert((item->single.iov_len & ior_WriteFile_flag) != 0); item->single.iov_len += bytes; return MDBX_SUCCESS; } #elif MDBX_HAVE_PWRITEV assert((int)item->sgvcnt > 0); - const void *end = (char *)(item->sgv[item->sgvcnt - 1].iov_base) + - item->sgv[item->sgvcnt - 1].iov_len; + const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base, + item->sgv[item->sgvcnt - 1].iov_len); if (unlikely(end == data)) { item->sgv[item->sgvcnt - 1].iov_len += bytes; ior->last_bytes += bytes; @@ -29101,7 +30463,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, return MDBX_SUCCESS; } #else - const void *end = (char *)(item->single.iov_base) + item->single.iov_len; + const void *end = ptr_disp(item->single.iov_base, item->single.iov_len); if (unlikely(end == data)) { item->single.iov_len += bytes; return MDBX_SUCCESS; @@ -29124,17 +30486,18 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, segments > OSAL_IOV_MAX) { /* WriteFile() */ item->single.iov_base = data; - item->single.iov_len = bytes + 1; - 
assert((item->single.iov_len & 1) != 0); + item->single.iov_len = bytes + ior_WriteFile_flag; + assert((item->single.iov_len & ior_WriteFile_flag) != 0); } else { /* WriteFileGather() */ + assert(ior->overlapped_fd); item->sgv[0].Buffer = PtrToPtr64(data); for (size_t i = 1; i < segments; ++i) { - data = (char *)data + ior->pagesize; + data = ptr_disp(data, ior->pagesize); item->sgv[slots_used].Buffer = PtrToPtr64(data); } item->sgv[slots_used].Buffer = 0; - assert((item->single.iov_len & 1) == 0); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); slots_used = segments; } ior->last_bytes = bytes; @@ -29162,11 +30525,13 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( #if defined(_WIN32) || defined(_WIN64) size_t offset = ior_offset(item); char *data = item->single.iov_base; - size_t bytes = item->single.iov_len - 1; + size_t bytes = item->single.iov_len - ior_WriteFile_flag; size_t i = 1; - if (bytes & 1) { + if (bytes & ior_WriteFile_flag) { data = Ptr64ToPtr(item->sgv[0].Buffer); bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { if (data + ior->pagesize != item->sgv[i].Buffer) { callback(ctx, offset, data, bytes); @@ -29197,7 +30562,7 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( } MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior) { +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; #if defined(_WIN32) || defined(_WIN64) @@ -29208,10 +30573,13 @@ osal_ioring_write(osal_ioring_t *ior) { LONG async_started = 0; for (ior_item_t *item = ior->pool; item <= ior->last;) { item->ov.Internal = STATUS_PENDING; - size_t i = 1, bytes = item->single.iov_len - 1; + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; r.wops += 1; - if (bytes & 1) { + if (bytes & ior_WriteFile_flag) { + assert(ior->overlapped_fd && fd == ior->overlapped_fd); bytes = ior->pagesize; + /* 
Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; @@ -29223,11 +30591,10 @@ osal_ioring_write(osal_ioring_t *ior) { r.err = GetLastError(); bailout_rc: assert(r.err != MDBX_SUCCESS); - CancelIo(ior->fd); + CancelIo(fd); return r; } - if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr, - &item->ov)) { + if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) { assert(item->ov.Internal == 0 && WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); ior_put_event(ior, item->ov.hEvent); @@ -29237,7 +30604,7 @@ osal_ioring_write(osal_ioring_t *ior) { if (unlikely(r.err != ERROR_IO_PENDING)) { ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileGather", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileGather", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -29246,11 +30613,11 @@ osal_ioring_write(osal_ioring_t *ior) { assert(wait_for > ior->event_pool + ior->event_stack); *--wait_for = item->ov.hEvent; } - } else if (ior->flags & IOR_OVERLAPPED) { + } else if (fd == ior->overlapped_fd) { assert(bytes < MAX_WRITE); retry: item->ov.hEvent = ior; - if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov, + if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov, ior_wocr)) { async_started += 1; } else { @@ -29259,7 +30626,7 @@ osal_ioring_write(osal_ioring_t *ior) { default: ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -29270,7 +30637,7 @@ 
osal_ioring_write(osal_ioring_t *ior) { WARNING( "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -29288,12 +30655,12 @@ osal_ioring_write(osal_ioring_t *ior) { } else { assert(bytes < MAX_WRITE); DWORD written = 0; - if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written, + if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written, &item->ov)) { r.err = (int)GetLastError(); ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFile", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFile", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -29348,17 +30715,18 @@ osal_ioring_write(osal_ioring_t *ior) { assert(ior->async_waiting == ior->async_completed); for (ior_item_t *item = ior->pool; item <= ior->last;) { - size_t i = 1, bytes = item->single.iov_len - 1; - if (bytes & 1) { + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + if (bytes & ior_WriteFile_flag) { bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; } if (!HasOverlappedIoCompleted(&item->ov)) { DWORD written = 0; - if (unlikely( - !GetOverlappedResult(ior->fd, &item->ov, &written, true))) { + if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) { ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", "GetOverlappedResult", __Wpedantic_format_voidptr(item), @@ -29408,16 +30776,16 @@ osal_ioring_write(osal_ioring_t *ior) { #if MDBX_HAVE_PWRITEV assert(item->sgvcnt > 0); if 
(item->sgvcnt == 1) - r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len, + r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len, item->offset); else - r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset); + r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset); // TODO: io_uring_prep_write(sqe, fd, ...); item = ior_next(item, item->sgvcnt); #else - r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len, + r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len, item->offset); item = ior_next(item, 1); #endif @@ -29438,14 +30806,19 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { #if defined(_WIN32) || defined(_WIN64) if (ior->last) { for (ior_item_t *item = ior->pool; item <= ior->last;) { - if (!HasOverlappedIoCompleted(&item->ov)) - CancelIoEx(ior->fd, &item->ov); + if (!HasOverlappedIoCompleted(&item->ov)) { + assert(ior->overlapped_fd); + CancelIoEx(ior->overlapped_fd, &item->ov); + } if (item->ov.hEvent && item->ov.hEvent != ior) ior_put_event(ior, item->ov.hEvent); size_t i = 1; - if ((item->single.iov_len & 1) == 0) + if ((item->single.iov_len & ior_WriteFile_flag) == 0) { + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) ++i; + } item = ior_next(item, i); } } @@ -29460,8 +30833,11 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { static void ior_cleanup(osal_ioring_t *ior, const size_t since) { osal_ioring_reset(ior); #if defined(_WIN32) || defined(_WIN64) - for (size_t i = since; i < ior->event_stack; ++i) + for (size_t i = since; i < ior->event_stack; ++i) { + /* Zap: Using uninitialized memory '**ior.event_pool' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); CloseHandle(ior->event_pool[i]); + } ior->event_stack = 0; #else (void)since; @@ -29473,13 +30849,12 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { #if defined(_WIN32) 
|| defined(_WIN64) if (ior->state & IOR_STATE_LOCKED) return MDBX_SUCCESS; - const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) && - mdbx_SetFileIoOverlappedRange && - items > 7; + const bool useSetFileIoOverlappedRange = + ior->overlapped_fd && mdbx_SetFileIoOverlappedRange && items > 42; const size_t ceiling = useSetFileIoOverlappedRange ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4) - : 4096; + : 1024; const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); items = bytes / sizeof(ior_item_t); #endif /* Windows */ @@ -29514,10 +30889,10 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { memset(ior->pool + ior->allocated, 0, sizeof(ior_item_t) * (items - ior->allocated)); ior->allocated = (unsigned)items; - ior->boundary = (char *)(ior->pool + ior->allocated); + ior->boundary = ptr_disp(ior->pool, ior->allocated); #if defined(_WIN32) || defined(_WIN64) if (useSetFileIoOverlappedRange) { - if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes)) + if (mdbx_SetFileIoOverlappedRange(ior->overlapped_fd, ptr, (ULONG)bytes)) ior->state += IOR_STATE_LOCKED; else return GetLastError(); @@ -29534,10 +30909,12 @@ MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { osal_memalign_free(ior->pool); osal_free(ior->event_pool); CloseHandle(ior->async_done); + if (ior->overlapped_fd) + CloseHandle(ior->overlapped_fd); #else osal_free(ior->pool); #endif - memset(ior, -1, sizeof(osal_ioring_t)); + memset(ior, 0, sizeof(osal_ioring_t)); } /*----------------------------------------------------------------------------*/ @@ -29562,6 +30939,50 @@ MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #endif } +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES) + return MDBX_RESULT_TRUE; + int err = GetLastError(); + return (err == 
ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) + ? MDBX_RESULT_FALSE + : err; +#else + if (access(pathname, F_OK) == 0) + return MDBX_RESULT_TRUE; + int err = errno; + return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err; +#endif +} + +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len) { + const pathchar_t *ext = nullptr; + for (size_t i = 0; i < len && pathname[i]; i++) + if (pathname[i] == '.') + ext = pathname + i; + else if (osal_isdirsep(pathname[i])) + ext = nullptr; + return (pathchar_t *)ext; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len) { +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = 0; i < len; ++i) { + pathchar_t a = l[i]; + pathchar_t b = r[i]; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? '/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -29695,7 +31116,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, flags |= O_CLOEXEC; #endif /* O_CLOEXEC */ - /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; @@ -29733,7 +31154,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, errno = EACCES /* restore errno if file exists */; } - /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { WARNING("Got 
STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", @@ -29857,7 +31278,7 @@ MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, #endif bytes -= written; offset += written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } @@ -29887,7 +31308,7 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, } #endif bytes -= written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } @@ -29944,10 +31365,11 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, while (1) { switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { case MDBX_SYNC_NONE: + case MDBX_SYNC_KICK: return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 case MDBX_SYNC_DATA: - if (fdatasync(fd) == 0) + if (likely(fdatasync(fd) == 0)) return MDBX_SUCCESS; break /* error */; #if defined(__linux__) || defined(__gnu_linux__) @@ -29957,7 +31379,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, #endif /* Linux */ #endif /* _POSIX_SYNCHRONIZED_IO > 0 */ default: - if (fsync(fd) == 0) + if (likely(fsync(fd) == 0)) return MDBX_SUCCESS; } @@ -30084,22 +31506,35 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits) { - uint8_t *ptr = (uint8_t *)map->address + offset; + if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) + return MDBX_SUCCESS; + + void *ptr = ptr_disp(map->base, offset); #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && + !FlushFileBuffers(map->fd)) + return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - assert(linux_kernel_version > 0x02061300); /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly - * tracks dirty pages and flushes them to storage as necessary. 
*/ - return MDBX_SUCCESS; + * tracks dirty pages and flushes ones as necessary. */ + // + // However, this behavior may be changed in custom kernels, + // so just leave such optimization to the libc discretion. + // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. + // + // assert(linux_kernel_version > 0x02061300); + // if (mode_bits <= MDBX_SYNC_KICK) + // return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) return errno; - mode_bits &= ~MDBX_SYNC_DATA; + if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd)) + return errno; #endif - return osal_fsync(map->fd, mode_bits); + return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, @@ -30132,6 +31567,50 @@ MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, return MDBX_SUCCESS; } +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle) { +#if defined(_WIN32) || defined(_WIN64) + (void)handle; +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; + +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif + switch (type) { + case 0x28cd3d45 /* CRAMFS_MAGIC */: + case 0x858458f6 /* RAMFS_MAGIC */: + case 0x01021994 /* TMPFS_MAGIC */: + case 0x73717368 /* SQUASHFS_MAGIC */: + case 0x7275 /* ROMFS_MAGIC */: + return MDBX_RESULT_TRUE; + } + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#else + const char *const name = ""; + const size_t name_len = 0; +#endif + if (name_len) { + if (strncasecmp("tmpfs", name, 6) == 0 || + strncasecmp("mfs", name, 4) == 0 || 
+ strncasecmp("ramfs", name, 6) == 0 || + strncasecmp("romfs", name, 6) == 0) + return MDBX_RESULT_TRUE; + } +#endif /* !Windows */ + + return MDBX_RESULT_FALSE; +} + static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) @@ -30435,13 +31914,12 @@ static int check_mmap_limit(const size_t limit) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t size, const size_t limit, - const unsigned options) { +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options) { assert(size <= limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; map->filesize = 0; #if defined(_WIN32) || defined(_WIN64) map->section = NULL; @@ -30457,6 +31935,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { err = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, err); if (err != MDBX_SUCCESS) return err; map->filesize = size; @@ -30465,9 +31944,16 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, #endif /* !Windows */ } else { err = osal_filesize(map->fd, &map->filesize); + VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err); if (err != MDBX_SUCCESS) return err; -#if !(defined(_WIN32) || defined(_WIN64)) +#if defined(_WIN32) || defined(_WIN64) + if (map->filesize < size) { + WARNING("file size (%zu) less than requested for mapping (%zu)", + (size_t)map->filesize, size); + size = (size_t)map->filesize; + } +#else map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; #endif /* !Windows */ } @@ -30493,7 +31979,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, : mdbx_RunningUnderWine() ? 
size : limit; err = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -30504,10 +31990,10 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, if (!NT_SUCCESS(err)) { NtClose(map->section); map->section = 0; - map->address = nullptr; + map->base = nullptr; return ntstatus2errcode(err); } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; @@ -30538,7 +32024,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, #define MAP_NORESERVE 0 #endif - map->address = mmap( + map->base = mmap( NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED | MAP_FILE | MAP_NORESERVE | (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | @@ -30546,10 +32032,10 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, : MAP_CONCEAL), map->fd, 0); - if (unlikely(map->address == MAP_FAILED)) { + if (unlikely(map->base == MAP_FAILED)) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -30557,39 +32043,37 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) return errno; #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* ! 
Windows */ - VALGRIND_MAKE_MEM_DEFINED(map->address, map->current); - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current); + VALGRIND_MAKE_MEM_DEFINED(map->base, map->current); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current); return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 - */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, - (map->filesize && map->filesize < map->limit) - ? map->filesize - : map->limit); + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->filesize && map->filesize < map->limit) ? map->filesize + : map->limit); #if defined(_WIN32) || defined(_WIN64) if (map->section) NtClose(map->section); - NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(rc)) ntstatus2errcode(rc); #else - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } @@ -30597,31 +32081,44 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit) { + int rc = osal_filesize(map->fd, &map->filesize); + VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size, + limit, map->filesize); assert(size <= limit); + if (rc != MDBX_SUCCESS) { + map->filesize = 0; + return rc; + } + #if defined(_WIN32) || defined(_WIN64) 
assert(size != map->current || limit != map->limit || size < map->filesize); NTSTATUS status; LARGE_INTEGER SectionSize; - int err, rc = MDBX_SUCCESS; + int err; - if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && - /* workaround for Wine */ mdbx_NtExtendSection) { - /* growth rw-section */ - SectionSize.QuadPart = size; - status = mdbx_NtExtendSection(map->section, &SectionSize); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - map->current = size; - if (map->filesize < size) - map->filesize = size; - return MDBX_SUCCESS; + if (limit == map->limit && size > map->current) { + if ((flags & MDBX_RDONLY) && map->filesize >= size) { + map->current = size; + return MDBX_SUCCESS; + } else if (!(flags & MDBX_RDONLY) && + /* workaround for Wine */ mdbx_NtExtendSection) { + /* growth rw-section */ + SectionSize.QuadPart = size; + status = mdbx_NtExtendSection(map->section, &SectionSize); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; + } } if (limit > map->limit) { @@ -30630,7 +32127,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, return err; /* check ability of address space for growth before unmap */ - PVOID BaseAddress = (PBYTE)map->address + map->limit; + PVOID BaseAddress = (PBYTE)map->base + map->limit; SIZE_T RegionSize = limit - map->limit; status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, &RegionSize, MEM_RESERVE, PAGE_NOACCESS); @@ -30650,15 +32147,17 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, * - change size of mapped view; * - extend read-only mapping; * Therefore we should unmap/map entire section. 
*/ - if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + if (size <= map->current && limit == map->limit) + return MDBX_SUCCESS; return MDBX_EPERM; + } /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 - */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); - status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); + status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(status)) return ntstatus2errcode(status); status = NtClose(map->section); @@ -30669,8 +32168,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, if (!NT_SUCCESS(status)) { bailout_ntstatus: err = ntstatus2errcode(status); - bailout: - map->address = NULL; + map->base = NULL; map->current = map->limit = 0; if (ReservedAddress) { ReservedSize = 0; @@ -30685,7 +32183,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, retry_file_and_section: /* resizing of the file may take a while, * therefore we reserve address space to avoid occupy it by other threads */ - ReservedAddress = map->address; + ReservedAddress = map->base; status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); if (!NT_SUCCESS(status)) { @@ -30695,13 +32193,9 @@ retry_file_and_section: if (flags & MDBX_MRESIZE_MAY_MOVE) /* the base address could be changed */ - map->address = NULL; + map->base = NULL; } - err = osal_filesize(map->fd, &map->filesize); - if (err != MDBX_SUCCESS) - goto bailout; - if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { err = osal_ftruncate(map->fd, size); if (err == MDBX_SUCCESS) @@ -30740,7 
+32234,7 @@ retry_file_and_section: retry_mapview:; SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit; status = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -30751,15 +32245,15 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { + map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { /* try remap at another base address */ - map->address = NULL; + map->base = NULL; goto retry_mapview; } NtClose(map->section); map->section = NULL; - if (map->address && (size != map->current || limit != map->limit)) { + if (map->base && (size != map->current || limit != map->limit)) { /* try remap with previously size and limit, * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; @@ -30771,25 +32265,24 @@ retry_mapview:; /* no way to recovery */ goto bailout_ntstatus; } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; #else /* Windows */ - map->filesize = 0; - int rc = osal_filesize(map->fd, &map->filesize); - if (rc != MDBX_SUCCESS) - return rc; - if (flags & MDBX_RDONLY) { + if (size > map->filesize) + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + else if (size < map->filesize && map->filesize > limit) + rc = MDBX_EPERM; map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; - if (map->current != size) - rc = (size > map->current) ? 
MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; } else { - if (map->filesize != size) { + if (size > map->filesize || + (size < map->filesize && (flags & MDBX_SHRINK_ALLOWED))) { rc = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, rc); if (rc != MDBX_SUCCESS) return rc; map->filesize = size; @@ -30803,7 +32296,7 @@ retry_mapview:; * - this allows us to clear the mask only within the file size * when closing the mapping. */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - (char *)map->address + size, + ptr_disp(map->base, size), ((map->current < map->limit) ? map->current : map->limit) - size); } map->current = size; @@ -30815,7 +32308,7 @@ retry_mapview:; if (limit < map->limit) { /* unmap an excess at end of mapping. */ // coverity[offset_free : FALSE] - if (unlikely(munmap(map->dxb + limit, map->limit - limit))) { + if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) { assert(errno != 0); return errno; } @@ -30828,10 +32321,10 @@ retry_mapview:; return err; assert(limit > map->limit); - uint8_t *ptr = MAP_FAILED; + void *ptr = MAP_FAILED; #if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) - ptr = mremap(map->address, map->limit, limit, + ptr = mremap(map->base, map->limit, limit, #if defined(MREMAP_MAYMOVE) (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : #endif /* MREMAP_MAYMOVE */ @@ -30860,11 +32353,11 @@ retry_mapview:; if (ptr == MAP_FAILED) { /* Try to mmap additional space beyond the end of mapping. 
*/ - ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, + ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot, mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); - if (ptr == map->dxb + map->limit) + if (ptr == ptr_disp(map->base, map->limit)) /* успешно прилепили отображение в конец */ - ptr = map->dxb; + ptr = map->base; else if (ptr != MAP_FAILED) { /* the desired address is busy, unmap unsuitable one */ if (unlikely(munmap(ptr, limit - map->limit))) { @@ -30897,13 +32390,13 @@ retry_mapview:; return MDBX_UNABLE_EXTEND_MAPSIZE; } - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, + ptr = mmap(map->base, limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE @@ -30913,13 +32406,13 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, - map->fd, 0); + ptr = + mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { /* try to restore prev mapping */ // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, + ptr = mmap(map->base, map->limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? 
MAP_FIXED_NOREPLACE @@ -30929,21 +32422,20 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, - (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? map->current : map->limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -30953,39 +32445,42 @@ retry_mapview:; } assert(ptr && ptr != MAP_FAILED); - if (map->address != ptr) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + if (map->base != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? 
map->current : map->limit); VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); - map->address = ptr; + map->base = ptr; } map->limit = limit; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) { + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) { assert(errno != 0); return errno; } #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* POSIX / Windows */ + /* Zap: Redundant code */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287); assert(rc != MDBX_SUCCESS || - (map->address != nullptr && map->address != MAP_FAILED && - map->current == size && map->limit == limit)); + (map->base != nullptr && map->base != MAP_FAILED && + map->current == size && map->limit == limit && + map->filesize >= size)); return rc; } @@ -31712,7 +33207,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, #ifndef xMDBX_ALLOY unsigned sys_pagesize; -MDBX_MAYBE_UNUSED unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity; #endif /* xMDBX_ALLOY */ void osal_ctor(void) { @@ -31738,6 +33233,7 @@ void osal_ctor(void) { assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); assert(sys_allocation_granularity >= sys_pagesize && sys_allocation_granularity % sys_pagesize == 0); + sys_pagesize_ln2 = log2n_powerof2(sys_pagesize); #if defined(__linux__) || defined(__gnu_linux__) posix_clockid = choice_monoclock(); @@ -31780,10 +33276,10 @@ __dll_export const struct MDBX_version_info mdbx_version = { 0, 12, - 2, + 6, 0, - {"2022-11-11T17:35:32+03:00", "cd8aa216aff5c70b45bd3afd46d417a95126dcc3", "9b062cf0c7d41297f756c7f7b897ed981022bdbf", - "v0.12.2-0-g9b062cf0"}, + {"2023-04-29T21:30:35+03:00", 
"44de01dd81ac366a7d37111eaf72726edebe5528", "c019631a8c88a98a11d814e4111a2a9ae8cb4099", + "v0.12.6-0-gc019631a"}, sourcery}; __dll_export @@ -31800,7 +33296,7 @@ __dll_export #endif const char *const mdbx_sourcery_anchor = sourcery; /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -31948,8 +33444,10 @@ static __inline int flock(HANDLE fd, unsigned flags, size_t offset, static __inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset, size_t bytes) { - return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags, - offset, bytes); + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + return flock_with_event(fd4data, env->me_data_lock_event, flags, offset, + bytes); } static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { @@ -31969,7 +33467,7 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { #else #define DXB_MAXLEN UINT32_C(0x7ff00000) #endif -#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN +#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN #define DXB_WHOLE 0, DXB_MAXLEN int mdbx_txn_lock(MDBX_env *env, bool dontwait) { @@ -31988,25 +33486,35 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } - if (env->me_flags & MDBX_EXCLUSIVE) + if (env->me_flags & MDBX_EXCLUSIVE) { + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); return MDBX_SUCCESS; + } - int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + int rc = flock_with_event(fd4data, env->me_data_lock_event, dontwait ? 
(LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), DXB_BODY); if (rc == ERROR_LOCK_VIOLATION && dontwait) { SleepEx(0, true); - rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + rc = flock_with_event(fd4data, env->me_data_lock_event, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); if (rc == ERROR_LOCK_VIOLATION) { SleepEx(0, true); - rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + rc = flock_with_event(fd4data, env->me_data_lock_event, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); } } - if (rc == MDBX_SUCCESS) + if (rc == MDBX_SUCCESS) { + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); return rc; + } LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; @@ -32014,7 +33522,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { void mdbx_txn_unlock(MDBX_env *env) { if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - int err = funlock(env->me_fd4data, DXB_BODY); + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + int err = funlock(fd4data, DXB_BODY); if (err != MDBX_SUCCESS) mdbx_panic("%s failed: err %u", __func__, err); } @@ -32071,17 +33581,18 @@ static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; if ((*array)->count == limit) { - void *ptr = osal_realloc( - (limit > ARRAY_LENGTH((*array)->handles)) - ? *array - : /* don't free initial array on the stack */ NULL, - sizeof(mdbx_handle_array_t) + - sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles))); + mdbx_handle_array_t *const ptr = + osal_realloc((limit > ARRAY_LENGTH((*array)->handles)) + ? 
*array + : /* don't free initial array on the stack */ NULL, + sizeof(mdbx_handle_array_t) + + sizeof(HANDLE) * (limit * (size_t)2 - + ARRAY_LENGTH((*array)->handles))); if (!ptr) return MDBX_ENOMEM; if (limit == ARRAY_LENGTH((*array)->handles)) - memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); - *array = (mdbx_handle_array_t *)ptr; + *ptr = **array; + *array = ptr; (*array)->limit = limit * 2; } @@ -32247,18 +33758,20 @@ static void lck_unlock(MDBX_env *env) { SetLastError(ERROR_SUCCESS); } - if (env->me_fd4data != INVALID_HANDLE_VALUE) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + if (fd4data != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ do - err = funlock(env->me_fd4data, DXB_BODY); + err = funlock(fd4data, DXB_BODY); while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); SetLastError(ERROR_SUCCESS); do - err = funlock(env->me_fd4data, DXB_WHOLE); + err = funlock(fd4data, DXB_WHOLE); while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); @@ -32318,7 +33831,9 @@ static int internal_seize_lck(HANDLE lfd) { } MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - assert(env->me_fd4data != INVALID_HANDLE_VALUE); + const HANDLE fd4data = + env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; + assert(fd4data != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -32350,7 +33865,7 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { return err; } jitter4testing(false); - err = funlock(env->me_fd4data, DXB_WHOLE); + err = funlock(fd4data, DXB_WHOLE); if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", err); @@ -32360,8 +33875,10 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { } MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; /* Transite from exclusive-write state (E-E) to used (S-?) */ - assert(env->me_fd4data != INVALID_HANDLE_VALUE); + assert(fd4data != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) @@ -32536,7 +34053,7 @@ static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { // If there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -32550,7 +34067,7 @@ static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { // Remove from the readers list, spin, try again _InterlockedDecrement(&srwl->readerCount); - YieldProcessor(); + SwitchToThread(); } } @@ -32566,7 +34083,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { // If there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -32581,7 +34098,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { // that we're the writer. 
while (srwl->readerCount != 0) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); - YieldProcessor(); + SwitchToThread(); } } @@ -32623,38 +34140,40 @@ MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; #endif /* GCC/MINGW */ static void mdbx_winnt_import(void) { - const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); - #define GET_PROC_ADDR(dll, ENTRY) \ mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY) - if (GetProcAddress(hNtdll, "wine_get_version")) { - assert(mdbx_RunningUnderWine()); - } else { - GET_PROC_ADDR(hNtdll, NtFsControlFile); - GET_PROC_ADDR(hNtdll, NtExtendSection); - assert(!mdbx_RunningUnderWine()); + const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); + if (hNtdll) { + if (GetProcAddress(hNtdll, "wine_get_version")) { + assert(mdbx_RunningUnderWine()); + } else { + GET_PROC_ADDR(hNtdll, NtFsControlFile); + GET_PROC_ADDR(hNtdll, NtExtendSection); + assert(!mdbx_RunningUnderWine()); + } } const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); - GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); - GET_PROC_ADDR(hKernel32dll, GetTickCount64); - if (!mdbx_GetTickCount64) - mdbx_GetTickCount64 = stub_GetTickCount64; - if (!mdbx_RunningUnderWine()) { - GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); - GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); - GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); - GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); - GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + if (hKernel32dll) { + GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); + GET_PROC_ADDR(hKernel32dll, GetTickCount64); + if (!mdbx_GetTickCount64) + mdbx_GetTickCount64 = stub_GetTickCount64; + if (!mdbx_RunningUnderWine()) { + GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); + GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); + GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); + GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + 
GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + } } - const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); - GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); -#undef GET_PROC_ADDR - - const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "InitializeSRWLock"); + const osal_srwlock_t_function init = + (osal_srwlock_t_function)(hKernel32dll + ? GetProcAddress(hKernel32dll, + "InitializeSRWLock") + : nullptr); if (init != NULL) { osal_srwlock_Init = init; osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( @@ -32672,6 +34191,12 @@ static void mdbx_winnt_import(void) { osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } + + const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); + if (hAdvapi32dll) { + GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); + } +#undef GET_PROC_ADDR } #if __GNUC_PREREQ(8, 0) @@ -32680,7 +34205,7 @@ static void mdbx_winnt_import(void) { #endif /* Windows LCK-implementation */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ index 14848abf5..45817290a 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -87,27 +87,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) 
/* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -117,8 +121,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -135,7 +139,7 @@ #include "mdbx.h++" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -428,8 +432,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -707,17 +711,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot 
__attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -728,17 +728,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot __attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -804,6 +800,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + 
/*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -975,7 +993,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1180,7 +1198,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is @@ -1193,14 +1212,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1213,8 +1233,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1282,13 +1306,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; 
volatile LONG async_waiting; @@ -1305,7 +1328,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1314,11 +1336,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1329,7 +1353,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1347,11 +1371,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1491,9 +1517,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1515,6 +1542,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1528,9 +1568,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1552,6 +1591,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1708,22 +1748,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1855,6 +1880,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1930,6 +1995,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1939,6 +2006,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1952,6 +2021,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1977,6 +2048,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2091,7 +2175,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2177,6 +2265,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2190,6 +2280,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2199,6 +2291,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2210,6 +2304,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2221,6 +2317,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2229,6 +2328,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2241,8 +2343,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2250,6 +2365,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2275,6 +2392,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2364,6 +2483,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2662,16 +2917,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2713,18 +2964,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2754,6 +3011,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2893,6 +3158,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2942,6 +3211,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3014,7 +3289,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3039,14 +3315,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3062,7 +3331,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3070,7 +3340,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3086,9 +3356,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3137,13 +3409,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3202,7 +3472,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3225,11 +3495,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3279,6 +3555,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3337,12 +3616,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3370,10 +3649,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3385,11 +3666,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3411,6 +3698,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3419,6 +3707,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3430,6 +3719,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3460,139 +3751,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3603,7 +3761,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3619,7 +3778,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3778,16 +3937,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3799,14 +3949,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -3890,7 +4040,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) // -// Copyright (c) 2020-2022, Leonid Yuriev . +// Copyright (c) 2020-2023, Leonid Yuriev . 
// SPDX-License-Identifier: Apache-2.0 // // Non-inline part of the libmdbx C++ API @@ -3905,6 +4055,12 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define __USE_MINGW_ANSI_STDIO 1 #endif /* MinGW */ +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(_MSC_VER) && defined(__SANITIZE_ADDRESS__) && \ + !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ + #include diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h index 4ed503661..cc4229813 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h @@ -25,7 +25,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2022, Leonid Yuriev +\authors Copyright (c) 2015-2023, Leonid Yuriev and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. \copyright Redistribution and use in source and binary forms, with or without @@ -77,10 +77,10 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #if defined(__riscv) || defined(__riscv__) || defined(__RISCV) || \ defined(__RISCV__) -#warning The RISC-V architecture is intentionally insecure by design. \ +#warning "The RISC-V architecture is intentionally insecure by design. \ Please delete this admonition at your own risk, \ if you make such decision informed and consciously. \ - Refer to https://clck.ru/32d9xH for more information. + Refer to https://clck.ru/32d9xH for more information." #endif /* RISC-V */ #ifdef _MSC_VER @@ -695,9 +695,9 @@ extern LIBMDBX_VERINFO_API const struct MDBX_build_info { * automatically (de)initialization, releasing reader lock table slots * and so on. 
* - * If MDBX is built as a DLL this is done out-of-the-box by DllEntry(), - * function which called automatically by Windows core with passing corresponding - * reason argument. + * If MDBX built as a DLL this is done out-of-the-box by DllEntry() function, + * which called automatically by Windows core with passing corresponding reason + * argument. * * Otherwise, if MDBX was built not as a DLL, some black magic * may be required depending of Windows version: @@ -881,7 +881,7 @@ enum MDBX_constants { /* DEBUG & LOGGING ************************************************************/ /** \addtogroup c_debug - * \note Most of debug feature enabled only when libmdbx is built with + * \note Most of debug feature enabled only when libmdbx built with * \ref MDBX_DEBUG build option. @{ */ /** Log level @@ -946,7 +946,7 @@ typedef enum MDBX_log_level_t MDBX_log_level_t; * * \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an * effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if - * libmdbx is built with \ref MDBX_DEBUG. */ + * libmdbx built with \ref MDBX_DEBUG. */ enum MDBX_debug_flags_t { MDBX_DBG_NONE = 0, @@ -1926,6 +1926,15 @@ enum MDBX_error_t { /** Overlapping read and write transactions for the current thread */ MDBX_TXN_OVERLAPPING = -30415, + /** Внутренняя ошибка возвращаемая в случае нехватки запаса свободных страниц + * при обновлении GC. Используется как вспомогательное средство для отладки. + * \note С точки зрения пользователя семантически + * равнозначна \ref MDBX_PROBLEM. */ + MDBX_BACKLOG_DEPLETED = -30414, + + /** Alternative/Duplicate LCK-file is exists and should be removed manually */ + MDBX_DUPLICATED_CLK = -30413, + /* The last of MDBX-added error codes */ MDBX_LAST_ADDED_ERRCODE = MDBX_TXN_OVERLAPPING, @@ -2051,7 +2060,9 @@ LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, * \returns a non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); -/** \brief MDBX environment options. */ +/** \brief MDBX environment extra runtime options. + * \ingroup c_settings + * \see mdbx_env_set_option() \see mdbx_env_get_option() */ enum MDBX_option_t { /** \brief Controls the maximum number of named databases for the environment. * @@ -2220,13 +2231,46 @@ enum MDBX_option_t { * to 50% (half empty) which corresponds to the range from 8192 and to 32768 * in units respectively. */ MDBX_opt_merge_threshold_16dot16_percent, + + /** \brief Controls the choosing between use write-through disk writes and + * usual ones with followed flush by the `fdatasync()` syscall. + * \details Depending on the operating system, storage subsystem + * characteristics and the use case, higher performance can be achieved by + * either using write-through or a serie of usual/lazy writes followed by + * the flush-to-disk. + * + * Basically for N chunks the latency/cost of write-through is: + * latency = N * (emit + round-trip-to-storage + storage-execution); + * And for serie of lazy writes with flush is: + * latency = N * (emit + storage-execution) + flush + round-trip-to-storage. + * + * So, for large N and/or noteable round-trip-to-storage the write+flush + * approach is win. But for small N and/or near-zero NVMe-like latency + * the write-through is better. + * + * To solve this issue libmdbx provide `MDBX_opt_writethrough_threshold`: + * - when N described above less or equal specified threshold, + * a write-through approach will be used; + * - otherwise, when N great than specified threshold, + * a write-and-flush approach will be used. + * + * \note MDBX_opt_writethrough_threshold affects only \ref MDBX_SYNC_DURABLE + * mode without \ref MDBX_WRITEMAP, and not supported on Windows. + * On Windows a write-through is used always but \ref MDBX_NOMETASYNC could + * be used for switching to write-and-flush. 
*/ + MDBX_opt_writethrough_threshold, + + /** \brief Controls prevention of page-faults of reclaimed and allocated pages + * in the \ref MDBX_WRITEMAP mode by clearing ones through file handle before + * touching. */ + MDBX_opt_prefault_write_enable, }; #ifndef __cplusplus /** \ingroup c_settings */ typedef enum MDBX_option_t MDBX_option_t; #endif -/** \brief Sets the value of a runtime options for an environment. +/** \brief Sets the value of a extra runtime options for an environment. * \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -2239,7 +2283,7 @@ typedef enum MDBX_option_t MDBX_option_t; LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, uint64_t value); -/** \brief Gets the value of runtime options from an environment. +/** \brief Gets the value of extra runtime options from an environment. * \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -2260,6 +2304,8 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, * be called later to discard the \ref MDBX_env handle and release associated * resources. * + * \note On Windows the \ref mdbx_env_openW() is recommended to use. + * * \param [in] env An environment handle returned * by \ref mdbx_env_create() * @@ -2327,8 +2373,11 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); -#if defined(_WIN32) || defined(_WIN64) -LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathnameW, +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open() + * \note Available only on Windows. 
+ * \see mdbx_env_open() */ +LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); #endif /* Windows */ @@ -2358,6 +2407,8 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; /** \brief Delete the environment's files in a proper and multiprocess-safe way. * \ingroup c_extra * + * \note On Windows the \ref mdbx_env_deleteW() is recommended to use. + * * \param [in] pathname The pathname for the database or the directory in which * the database files reside. * @@ -2374,8 +2425,12 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; * so no deletion was performed. */ LIBMDBX_API int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode); -#if defined(_WIN32) || defined(_WIN64) -LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_delete() + * \note Available only on Windows. + * \see mdbx_env_delete() */ +LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathname, MDBX_env_delete_mode_t mode); #endif /* Windows */ @@ -2388,6 +2443,8 @@ LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, * parallel with write transactions, because it employs a read-only * transaction. See long-lived transactions under \ref restrictions section. * + * \note On Windows the \ref mdbx_env_copyW() is recommended to use. + * * \param [in] env An environment handle returned by mdbx_env_create(). * It must have already been opened successfully. * \param [in] dest The pathname of a file in which the copy will reside. @@ -2412,7 +2469,11 @@ LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, * \returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, MDBX_copy_flags_t flags); -#if defined(_WIN32) || defined(_WIN64) + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_copy() + * \note Available only on Windows. + * \see mdbx_env_copy() */ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest, MDBX_copy_flags_t flags); #endif /* Windows */ @@ -2555,16 +2616,18 @@ struct MDBX_envinfo { * first process opened the database after everyone had previously closed it). */ struct { - uint64_t newly; /**< Quantity of a new pages added */ - uint64_t cow; /**< Quantity of pages copied for update */ - uint64_t clone; /**< Quantity of parent's dirty pages clones - for nested transactions */ - uint64_t split; /**< Page splits */ - uint64_t merge; /**< Page merges */ - uint64_t spill; /**< Quantity of spilled dirty pages */ - uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ - uint64_t wops; /**< Number of explicit write operations (not a pages) - to a disk */ + uint64_t newly; /**< Quantity of a new pages added */ + uint64_t cow; /**< Quantity of pages copied for update */ + uint64_t clone; /**< Quantity of parent's dirty pages clones + for nested transactions */ + uint64_t split; /**< Page splits */ + uint64_t merge; /**< Page merges */ + uint64_t spill; /**< Quantity of spilled dirty pages */ + uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ + uint64_t wops; /**< Number of explicit write operations (not a pages) + to a disk */ + uint64_t prefault; /**< Number of prefault write operations (not a pages) */ + uint64_t mincore; /**< Number of mincore() calls */ uint64_t msync; /**< Number of explicit msync-to-disk operations (not a pages) */ uint64_t @@ -2951,6 +3014,8 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); /** \brief Return the path that was used in mdbx_env_open(). 
* \ingroup c_statinfo * + * \note On Windows the \ref mdbx_env_get_pathW() is recommended to use. + * * \param [in] env An environment handle returned by \ref mdbx_env_create() * \param [out] dest Address of a string pointer to contain the path. * This is the actual string in the environment, not a @@ -2959,9 +3024,12 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ -#if !(defined(_WIN32) || defined(_WIN64)) LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); -#else + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_get_path() + * \note Available only on Windows. + * \see mdbx_env_get_path() */ LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); #endif /* Windows */ @@ -2989,6 +3057,8 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * it is reasonable to know some details in order to make optimal decisions * when choosing parameters. * + * \see mdbx_env_info_ex() + * * Both \ref mdbx_env_set_geometry() and legacy \ref mdbx_env_set_mapsize() are * inapplicable to read-only opened environment. * @@ -3098,7 +3168,7 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * \note Actual values may be different than your have specified because of * rounding to specified database page size, the system page size and/or the * size of the system virtual memory management unit. You can get actual values - * by \ref mdbx_env_sync_ex() or see by using the tool `mdbx_chk` with the `-v` + * by \ref mdbx_env_info_ex() or see by using the tool `mdbx_chk` with the `-v` * option. 
* * Legacy \ref mdbx_env_set_mapsize() correspond to calling @@ -3766,13 +3836,10 @@ struct MDBX_commit_latency { /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри * GC ради данных пользователя. */ uint32_t work_rtime_monotonic; - /** \brief Монотонное время по "настенным часам" затраченное + /** \brief Время ЦПУ в режиме пользователе затраченное * на подготовку страниц извлекаемых из GC для данных пользователя, * включая подкачку с диска. */ - uint32_t work_xtime_monotonic; - /** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск - * внтури GC ради данных пользователя. */ - uint32_t work_rtime_cpu; + uint32_t work_xtime_cpu; /** \brief Количество итераций поиска внутри GC при выделении страниц * ради данных пользователя. */ uint32_t work_rsteps; @@ -3789,13 +3856,10 @@ struct MDBX_commit_latency { /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри * GC для целей поддержки и обновления самой GC. */ uint32_t self_rtime_monotonic; - /** \brief Монотонное время по "настенным часам" затраченное на подготовку + /** \brief Время ЦПУ в режиме пользователе затраченное на подготовку * страниц извлекаемых из GC для целей поддержки и обновления самой GC, * включая подкачку с диска. */ - uint32_t self_xtime_monotonic; - /** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск - * внтури GC для целей поддержки и обновления самой GC. */ - uint32_t self_rtime_cpu; + uint32_t self_xtime_cpu; /** \brief Количество итераций поиска внутри GC при выделении страниц * для целей поддержки и обновления самой GC. */ uint32_t self_rsteps; @@ -4128,6 +4192,8 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, * by current thread. 
*/ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); +LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi); /** \deprecated Please * \ref avoid_custom_comparators "avoid using custom comparators" and use @@ -4147,6 +4213,9 @@ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +MDBX_DEPRECATED LIBMDBX_API int +mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); /** \defgroup value2key Value-to-Key functions * \brief Value-to-Key functions to @@ -5444,18 +5513,20 @@ typedef enum MDBX_page_type_t MDBX_page_type_t; #endif /** \brief Pseudo-name for MainDB */ -#define MDBX_PGWALK_MAIN ((const char *)((ptrdiff_t)0)) +#define MDBX_PGWALK_MAIN ((void *)((ptrdiff_t)0)) /** \brief Pseudo-name for GarbageCollectorDB */ -#define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1)) +#define MDBX_PGWALK_GC ((void *)((ptrdiff_t)-1)) /** \brief Pseudo-name for MetaPages */ -#define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2)) +#define MDBX_PGWALK_META ((void *)((ptrdiff_t)-2)) /** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ -typedef int MDBX_pgvisitor_func( - const uint64_t pgno, const unsigned number, void *const ctx, const int deep, - const char *const dbi, const size_t page_size, const MDBX_page_type_t type, - const MDBX_error_t err, const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; +typedef int +MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; /** \brief B-tree traversal function. */ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, @@ -5466,13 +5537,20 @@ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, * * This function mostly of internal API for `mdbx_chk` utility and subject to * change at any time. Do not use this function to avoid shooting your own - * leg(s). */ + * leg(s). + * + * \note On Windows the \ref mdbx_env_open_for_recoveryW() is recommended + * to use. */ LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable); -#if defined(_WIN32) || defined(_WIN64) + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open_for_recovery() + * \note Available only on Windows. 
+ * \see mdbx_env_open_for_recovery() */ LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, - const wchar_t *pathnameW, + const wchar_t *pathname, unsigned target_meta, bool writeable); #endif /* Windows */ diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ index c4563927e..ac956790e 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h++ @@ -1,7 +1,7 @@ /// \file mdbx.h++ /// \brief The libmdbx C++ API header file. /// -/// \author Copyright (c) 2020-2022, Leonid Yuriev . +/// \author Copyright (c) 2020-2023, Leonid Yuriev . /// \copyright SPDX-License-Identifier: Apache-2.0 /// /// Tested with: @@ -84,6 +84,11 @@ #include #endif +#if __cplusplus >= 201103L +#include +#include +#endif + #include "mdbx.h" #if (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L) || \ @@ -223,17 +228,18 @@ #endif /* MDBX_CXX20_UNLIKELY */ #ifndef MDBX_HAVE_CXX20_CONCEPTS -#if defined(DOXYGEN) || \ - (defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L) +#if defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L #include #define MDBX_HAVE_CXX20_CONCEPTS 1 +#elif defined(DOXYGEN) +#define MDBX_HAVE_CXX20_CONCEPTS 1 #else #define MDBX_HAVE_CXX20_CONCEPTS 0 #endif /* */ #endif /* MDBX_HAVE_CXX20_CONCEPTS */ #ifndef MDBX_CXX20_CONCEPT -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) #define MDBX_CXX20_CONCEPT(CONCEPT, NAME) CONCEPT NAME #else #define MDBX_CXX20_CONCEPT(CONCEPT, NAME) typename NAME @@ -241,7 +247,7 @@ #endif /* MDBX_CXX20_CONCEPT */ #ifndef MDBX_ASSERT_CXX20_CONCEPT_SATISFIED -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) #define MDBX_ASSERT_CXX20_CONCEPT_SATISFIED(CONCEPT, TYPE) \ static_assert(CONCEPT) #else @@ -287,7 +293,7 @@ namespace mdbx { // To enable all kinds of an compiler optimizations we use a byte-like type // that don't presumes 
aliases for pointers as does the `char` type and its // derivatives/typedefs. -// Please see https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/263 +// Please see https://libmdbx.dqdkfa.ru/dead-github/issues/263 // for reasoning of the use of `char8_t` type and switching to `__restrict__`. using byte = char8_t; #else @@ -350,6 +356,9 @@ class cursor_managed; __cpp_lib_memory_resource >= 201603L && _GLIBCXX_USE_CXX11_ABI) /// \brief Default polymorphic allocator for modern code. using polymorphic_allocator = ::std::pmr::string::allocator_type; +using default_allocator = polymorphic_allocator; +#else +using default_allocator = legacy_allocator; #endif /* __cpp_lib_memory_resource >= 201603L */ /// \brief Default singe-byte string. @@ -385,6 +394,11 @@ using path = ::std::wstring; using path = ::std::string; #endif /* mdbx::path */ +#if __cplusplus >= 201103L || defined(DOXYGEN) +/// \brief Duration in 1/65536 units of second. +using duration = ::std::chrono::duration>; +#endif /* Duration for C++11 */ + /// \defgroup cxx_exceptions exceptions and errors /// @{ @@ -551,8 +565,11 @@ static MDBX_CXX14_CONSTEXPR size_t check_length(size_t headroom, size_t payload, /// \defgroup cxx_data slices and buffers /// @{ -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) +/** \concept MutableByteProducer + * \interface MutableByteProducer + * \brief MutableByteProducer C++20 concept */ template concept MutableByteProducer = requires(T a, char array[42]) { { a.is_empty() } -> std::same_as; @@ -560,6 +577,9 @@ concept MutableByteProducer = requires(T a, char array[42]) { { a.write_bytes(&array[0], size_t(42)) } -> std::same_as; }; +/** \concept ImmutableByteProducer + * \interface ImmutableByteProducer + * \brief ImmutableByteProducer C++20 concept */ template concept ImmutableByteProducer = requires(const T &a, char array[42]) { { a.is_empty() } -> std::same_as; @@ -567,12 +587,15 @@ concept ImmutableByteProducer = requires(const T 
&a, char array[42]) { { a.write_bytes(&array[0], size_t(42)) } -> std::same_as; }; +/** \concept SliceTranscoder + * \interface SliceTranscoder + * \brief SliceTranscoder C++20 concept */ template -concept SliceTranscoder = ImmutableByteProducer && - requires(const slice &source, const T &a) { - T(source); - { a.is_erroneous() } -> std::same_as; -}; +concept SliceTranscoder = + ImmutableByteProducer && requires(const slice &source, const T &a) { + T(source); + { a.is_erroneous() } -> std::same_as; + }; #endif /* MDBX_HAVE_CXX20_CONCEPTS */ @@ -2639,45 +2662,69 @@ public: return buffer(src, make_reference); } - static buffer key_from(const silo &&src) noexcept { + static buffer key_from(silo &&src) noexcept { return buffer(::std::move(src)); } - static buffer key_from(const double ieee754_64bit) { + static buffer key_from_double(const double ieee754_64bit) { return wrap(::mdbx_key_from_double(ieee754_64bit)); } + static buffer key_from(const double ieee754_64bit) { + return key_from_double(ieee754_64bit); + } + static buffer key_from(const double *ieee754_64bit) { return wrap(::mdbx_key_from_ptrdouble(ieee754_64bit)); } - static buffer key_from(const uint64_t unsigned_int64) { + static buffer key_from_u64(const uint64_t unsigned_int64) { return wrap(unsigned_int64); } - static buffer key_from(const int64_t signed_int64) { + static buffer key_from(const uint64_t unsigned_int64) { + return key_from_u64(unsigned_int64); + } + + static buffer key_from_i64(const int64_t signed_int64) { return wrap(::mdbx_key_from_int64(signed_int64)); } + static buffer key_from(const int64_t signed_int64) { + return key_from_i64(signed_int64); + } + static buffer key_from_jsonInteger(const int64_t json_integer) { return wrap(::mdbx_key_from_jsonInteger(json_integer)); } - static buffer key_from(const float ieee754_32bit) { + static buffer key_from_float(const float ieee754_32bit) { return wrap(::mdbx_key_from_float(ieee754_32bit)); } + static buffer key_from(const float ieee754_32bit) { 
+ return key_from_float(ieee754_32bit); + } + static buffer key_from(const float *ieee754_32bit) { return wrap(::mdbx_key_from_ptrfloat(ieee754_32bit)); } - static buffer key_from(const uint32_t unsigned_int32) { + static buffer key_from_u32(const uint32_t unsigned_int32) { return wrap(unsigned_int32); } - static buffer key_from(const int32_t signed_int32) { + static buffer key_from(const uint32_t unsigned_int32) { + return key_from_u32(unsigned_int32); + } + + static buffer key_from_i32(const int32_t signed_int32) { return wrap(::mdbx_key_from_int32(signed_int32)); } + + static buffer key_from(const int32_t signed_int32) { + return key_from_i32(signed_int32); + } }; template = 201103L || defined(DOXYGEN) /// \brief Sets relative period since the last unsteady commit to force flush /// the data buffers to disk, for non-sync durability modes. /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. + /// \details The relative period value affects all processes which operates + /// with given environment until the last process close environment or a new + /// value will be settled. 
Data is always written to disk when \ref + /// txn_managed::commit() is called, but the operating system may keep it + /// buffered. MDBX always flushes the OS buffers upon commit as well, unless + /// the environment was opened with \ref whole_fragile, \ref lazy_weak_tail or + /// in part \ref half_synchronous_weak_last. Settled period don't checked + /// asynchronously, but only by the \ref txn_managed::commit() and \ref + /// env::sync_to_disk() functions. Therefore, in cases where transactions are + /// committed infrequently and/or irregularly, polling by \ref + /// env::poll_sync_to_disk() may be a reasonable solution to timeout + /// enforcement. /// + /// The default is 0, than mean no any timeout checked, and no additional + /// flush will be made. + /// \see extra_runtime_option::sync_period + inline env &set_sync_period(const duration &period); + + /// \brief Gets relative period since the last unsteady commit that used to + /// force flush the data buffers to disk, for non-sync durability modes. + /// \copydetails set_sync_period(const duration&) + /// \see set_sync_period(const duration&) + /// \see extra_runtime_option::sync_period + inline duration sync_period() const; +#endif + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds_16dot16 The period in 1/65536 of second when a /// synchronous flush would be made since the last unsteady commit. - inline env &set_sync_period(unsigned seconds_16dot16); + inline env &set_sync_period__seconds_16dot16(unsigned seconds_16dot16); - /// \brief Sets relative period since the last unsteady commit to force flush - /// the data buffers to disk, for non-sync durability modes. - /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. 
MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. - /// + /// \copydoc sync_period() + /// \see sync_period__seconds_16dot16(unsigned) + inline unsigned sync_period__seconds_16dot16() const; + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds The period in second when a synchronous flush would /// be made since the last unsteady commit. - inline env &set_sync_period(double seconds); + inline env &set_sync_period__seconds_double(double seconds); + + /// \copydoc sync_period() + /// \see set_sync_period__seconds_double(double) + inline double sync_period__seconds_double() const; + + /// \copydoc MDBX_option_t + enum class extra_runtime_option { + /// \copydoc MDBX_opt_max_db + /// \see max_maps() \see env::operate_parameters::max_maps + max_maps = MDBX_opt_max_db, + /// \copydoc MDBX_opt_max_readers + /// \see max_readers() \see env::operate_parameters::max_readers + max_readers = MDBX_opt_max_readers, + /// \copydoc MDBX_opt_sync_bytes + /// \see sync_threshold() \see set_sync_threshold() + sync_bytes = MDBX_opt_sync_bytes, + /// \copydoc MDBX_opt_sync_period + /// \see sync_period() \see set_sync_period() + sync_period = MDBX_opt_sync_period, + /// \copydoc MDBX_opt_rp_augment_limit + rp_augment_limit = MDBX_opt_rp_augment_limit, + /// \copydoc MDBX_opt_loose_limit + loose_limit = MDBX_opt_loose_limit, + /// \copydoc MDBX_opt_dp_reserve_limit + dp_reserve_limit = 
MDBX_opt_dp_reserve_limit, + /// \copydoc MDBX_opt_txn_dp_limit + dp_limit = MDBX_opt_txn_dp_limit, + /// \copydoc MDBX_opt_txn_dp_initial + dp_initial = MDBX_opt_txn_dp_initial, + /// \copydoc MDBX_opt_spill_max_denominator + spill_max_denominator = MDBX_opt_spill_max_denominator, + /// \copydoc MDBX_opt_spill_min_denominator + spill_min_denominator = MDBX_opt_spill_min_denominator, + /// \copydoc MDBX_opt_spill_parent4child_denominator + spill_parent4child_denominator = MDBX_opt_spill_parent4child_denominator, + /// \copydoc MDBX_opt_merge_threshold_16dot16_percent + merge_threshold_16dot16_percent = MDBX_opt_merge_threshold_16dot16_percent, + /// \copydoc MDBX_opt_writethrough_threshold + writethrough_threshold = MDBX_opt_writethrough_threshold, + /// \copydoc MDBX_opt_prefault_write_enable + prefault_write_enable = MDBX_opt_prefault_write_enable, + }; + + /// \copybrief mdbx_env_set_option() + inline env &set_extra_option(extra_runtime_option option, uint64_t value); + + /// \copybrief mdbx_env_get_option() + inline uint64_t extra_option(extra_runtime_option option) const; /// \brief Alter environment flags. 
inline env &alter_flags(MDBX_env_flags_t flags, bool on_off); @@ -3591,7 +3700,7 @@ public: void close(bool dont_sync = false); env_managed(env_managed &&) = default; - env_managed &operator=(env_managed &&other) { + env_managed &operator=(env_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); @@ -3890,7 +3999,7 @@ class LIBMDBX_API_TYPE txn_managed : public txn { public: MDBX_CXX11_CONSTEXPR txn_managed() noexcept = default; txn_managed(txn_managed &&) = default; - txn_managed &operator=(txn_managed &&other) { + txn_managed &operator=(txn_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); @@ -4112,7 +4221,7 @@ public: void close(); cursor_managed(cursor_managed &&) = default; - cursor_managed &operator=(cursor_managed &&other) { + cursor_managed &operator=(cursor_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); @@ -5056,13 +5165,53 @@ inline env &env::set_sync_threshold(size_t bytes) { return *this; } -inline env &env::set_sync_period(unsigned seconds_16dot16) { +inline size_t env::sync_threshold() const { + size_t bytes; + error::success_or_throw(::mdbx_env_get_syncbytes(handle_, &bytes)); + return bytes; +} + +inline env &env::set_sync_period__seconds_16dot16(unsigned seconds_16dot16) { error::success_or_throw(::mdbx_env_set_syncperiod(handle_, seconds_16dot16)); return *this; } -inline env &env::set_sync_period(double seconds) { - return set_sync_period(unsigned(seconds * 65536)); +inline unsigned env::sync_period__seconds_16dot16() const { + unsigned seconds_16dot16; + error::success_or_throw(::mdbx_env_get_syncperiod(handle_, &seconds_16dot16)); + return seconds_16dot16; +} + +inline env &env::set_sync_period__seconds_double(double seconds) { + return set_sync_period__seconds_16dot16(unsigned(seconds * 65536)); +} + +inline double env::sync_period__seconds_double() const { + 
return sync_period__seconds_16dot16() / 65536.0; +} + +#if __cplusplus >= 201103L +inline env &env::set_sync_period(const duration &period) { + return set_sync_period__seconds_16dot16(period.count()); +} + +inline duration env::sync_period() const { + return duration(sync_period__seconds_16dot16()); +} +#endif + +inline env &env::set_extra_option(enum env::extra_runtime_option option, + uint64_t value) { + error::success_or_throw( + ::mdbx_env_set_option(handle_, ::MDBX_option_t(option), value)); + return *this; +} + +inline uint64_t env::extra_option(enum env::extra_runtime_option option) const { + uint64_t value; + error::success_or_throw( + ::mdbx_env_get_option(handle_, ::MDBX_option_t(option), &value)); + return value; } inline env &env::alter_flags(MDBX_env_flags_t flags, bool on_off) { diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c index ae2066463..74bde38a3 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_chk.c @@ -1,7 +1,7 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -109,27 +109,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before 
comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -139,8 +143,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -157,7 +161,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -450,8 +454,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -729,17 +733,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, 
__optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -750,17 +750,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot __attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -826,6 +822,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if 
defined(MDBX_USE_VALGRIND) @@ -997,7 +1015,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1202,7 +1220,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is @@ -1215,14 +1234,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1235,8 +1255,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1304,13 +1328,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1327,7 +1350,6 @@ typedef struct osal_ioring { #define 
ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1336,11 +1358,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1351,7 +1375,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1369,11 +1393,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1513,9 +1539,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1537,6 +1564,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1550,9 +1590,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1574,6 +1613,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1730,22 +1770,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1877,6 +1902,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1952,6 +2017,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1961,6 +2028,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1974,6 +2043,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1999,6 +2070,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2113,7 +2197,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2199,6 +2287,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2212,6 +2302,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2221,6 +2313,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2232,6 +2326,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2243,6 +2339,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2251,6 +2350,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2263,8 +2365,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2272,6 +2387,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2297,6 +2414,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2386,6 +2505,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2684,16 +2939,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2735,18 +2986,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2776,6 +3033,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2915,6 +3180,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2964,6 +3233,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3036,7 +3311,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3061,14 +3337,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3084,7 +3353,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3092,7 +3362,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3108,9 +3378,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3159,13 +3431,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3224,7 +3494,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3247,11 +3517,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3301,6 +3577,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3359,12 +3638,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3392,10 +3671,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3407,11 +3688,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3433,6 +3720,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3441,6 +3729,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3452,6 +3741,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3482,139 +3773,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3625,7 +3783,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3641,7 +3800,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3800,16 +3959,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3821,14 +3971,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -3912,6 +4062,8 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +#include + typedef struct flagbit { int bit; const char *name; @@ -4046,7 +4198,7 @@ static void signal_handler(int sig) { #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE typedef struct { - const char *name; + MDBX_val name; struct { uint64_t branch, large_count, large_volume, leaf; uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; @@ -4077,7 +4229,7 @@ uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned verbose; bool 
ignore_wrong_order, quiet, dont_traversal; -const char *only_subdb; +MDBX_val only_subdb; int stuck_meta = -1; struct problem { @@ -4100,6 +4252,97 @@ static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { } } +static MDBX_val printable_buf; +static void free_printable_buf(void) { osal_free(printable_buf.iov_base); } + +static const char *sdb_name(const MDBX_val *val) { + if (val == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (val == MDBX_PGWALK_GC) + return "@GC"; + if (val == MDBX_PGWALK_META) + return "@META"; + + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (data == MDBX_PGWALK_GC) + return "@GC"; + if (data == MDBX_PGWALK_META) + return "@META"; + + if (!len) + return ""; + if (!data) + return ""; + if (len > 65536) { + static char buf[64]; + /* NOTE: There is MSYS2 MinGW bug if you here got + * the "unknown conversion type character ‘z’ in format [-Werror=format=]" + * https://stackoverflow.com/questions/74504432/whats-the-proper-way-to-tell-mingw-based-gcc-to-use-ansi-stdio-output-on-windo + */ + snprintf(buf, sizeof(buf), "", len); + return buf; + } + + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < val->iov_len && printable; ++i) { + quoting |= data[i] != '_' && isalnum(data[i]) == 0; + printable = isprint(data[i]) != 0 || + (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); + } + + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > printable_buf.iov_len) { + void *ptr = osal_realloc(printable_buf.iov_base, need); + if (!ptr) + return ""; + if (!printable_buf.iov_base) + atexit(free_printable_buf); + printable_buf.iov_base = ptr; + printable_buf.iov_len = need; + } + + char *out = printable_buf.iov_base; + if (!quoting) { + memcpy(out, data, len); + out += len; + } else if (printable) { + *out++ = '\''; + for (size_t i = 0; i < len; ++i) { 
+ if (data[i] < ' ') { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 4); + static const char hex[] = "0123456789abcdef"; + out[0] = '\\'; + out[1] = 'x'; + out[2] = hex[data[i] >> 4]; + out[3] = hex[data[i] & 15]; + out += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 2); + out[0] = '\\'; + out[1] = data[i]; + out += 2; + } else { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 1); + *out++ = data[i]; + } + } + *out++ = '\''; + } + assert((char *)printable_buf.iov_base + printable_buf.iov_len > out); + *out = 0; + return printable_buf.iov_base; +} + static void va_log(MDBX_log_level_t level, const char *function, int line, const char *msg, va_list args) { static const char *const prefixes[] = { @@ -4165,19 +4408,17 @@ static int check_user_break(void) { } static void pagemap_cleanup(void) { - for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; - i < ARRAY_LENGTH(walk.dbi); ++i) { - if (walk.dbi[i].name) { - osal_free((void *)walk.dbi[i].name); - walk.dbi[i].name = nullptr; - } - } - osal_free(walk.pagemap); walk.pagemap = nullptr; } -static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { +static bool eq(const MDBX_val a, const MDBX_val b) { + return a.iov_len == b.iov_len && + (a.iov_base == b.iov_base || a.iov_len == 0 || + !memcmp(a.iov_base, b.iov_base, a.iov_len)); +} + +static walk_dbi_t *pagemap_lookup_dbi(const MDBX_val *dbi_name, bool silent) { static walk_dbi_t *last; if (dbi_name == MDBX_PGWALK_MAIN) @@ -4187,24 +4428,24 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (dbi_name == MDBX_PGWALK_META) return &dbi_meta; - if (last && strcmp(last->name, dbi_name) == 0) + if (last && eq(last->name, *dbi_name)) return last; walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; - for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { - if 
(strcmp(dbi->name, dbi_name) == 0) + for (; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + if (eq(dbi->name, *dbi_name)) return last = dbi; } if (verbose > 0 && !silent) { - print(" - found '%s' area\n", dbi_name); + print(" - found %s area\n", sdb_name(dbi_name)); fflush(nullptr); } if (dbi == ARRAY_END(walk.dbi)) return nullptr; - dbi->name = osal_strdup(dbi_name); + dbi->name = *dbi_name; return last = dbi; } @@ -4279,13 +4520,13 @@ static size_t problems_pop(struct problem *list) { } static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, - void *const ctx, const int deep, - const char *const dbi_name_or_tag, const size_t page_size, - const MDBX_page_type_t pagetype, const MDBX_error_t err, - const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) { + void *const ctx, const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t pagetype, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { (void)ctx; - const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC; + const bool is_gc_tree = dbi_name == MDBX_PGWALK_GC; if (deep > 42) { problem_add("deep", deep, "too large", nullptr); data_tree_problems += !is_gc_tree; @@ -4293,7 +4534,7 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, return MDBX_CORRUPTED /* avoid infinite loop/recursion */; } - walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false); + walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name, false); if (!dbi) { data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -4358,14 +4599,14 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, } if (pgnumber) { - if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) { + if (verbose > 3 && (!only_subdb.iov_base || eq(only_subdb, dbi->name))) { if (pgnumber == 1) print(" %s-page %" PRIu64, pagetype_caption, 
pgno); else print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i\n", - dbi->name, header_bytes, + sdb_name(&dbi->name), header_bytes, (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, payload_bytes, unused_bytes, deep); } @@ -4383,8 +4624,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; problem_add("page", spanpgno, (branch && coll_dbi == dbi) ? "loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, - deep); + "%s-page: by %s, deep %i", pagetype_caption, + sdb_name(&coll_dbi->name), deep); already_used = true; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -4455,8 +4696,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { - dbi->payload_bytes += payload_bytes + header_bytes; - walk.total_payload_bytes += payload_bytes + header_bytes; + dbi->payload_bytes += (uint64_t)payload_bytes + header_bytes; + walk.total_payload_bytes += (uint64_t)payload_bytes + header_bytes; } } } @@ -4466,8 +4707,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent); +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { @@ -4516,7 +4757,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, pgno_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : txn->mt_next_pgno; pgno_t span = 1; - for (unsigned i = 0; i < number; ++i) { + for (size_t i = 0; i < number; ++i) { if (check_user_break()) return MDBX_EINTR; const pgno_t pgno = iptr[i]; @@ -4535,7 +4776,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (MDBX_PNL_DISORDERED(prev, pgno)) { bad = " [bad sequence]"; problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, + "%" PRIaPGNO " %c [%zu].%" PRIaPGNO, prev, (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, pgno); } @@ -4545,7 +4786,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, walk.pagemap[pgno] = -1; else if (idx > 0) problem_add("page", pgno, "already used", "by %s", - walk.dbi[idx - 1].name); + sdb_name(&walk.dbi[idx - 1].name)); else problem_add("page", pgno, "already listed in GC", nullptr); } @@ -4556,12 +4797,12 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, : pgno_sub(pgno, span))) ++span; } - if (verbose > 3 && !only_subdb) { + if (verbose > 3 && !only_subdb.iov_base) { print(" transaction %" PRIaTXN ", %" PRIuPTR " pages, maxspan %" PRIaPGNO "%s\n", txnid, number, span, bad); if (verbose > 4) { - for (unsigned i = 0; i < number; i += span) { + for (size_t i = 0; i < number; i += span) { const pgno_t pgno = iptr[i]; for (span = 1; i + span < number && @@ -4583,36 +4824,18 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, } static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { - return (a->iov_len == b->iov_len && - memcmp(a->iov_base, b->iov_base, a->iov_len) == 0) - ? 0 - : 1; + return eq(*a, *b) ? 
0 : 1; } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { - char *name; - int rc; - size_t i; - - name = key->iov_base; - for (i = 0; i < key->iov_len; ++i) { - if (name[i] < ' ') - return handle_userdb(record_number, key, data); + if (data->iov_len == sizeof(MDBX_db)) { + int rc = process_db(~0u, key, handle_userdb); + if (rc != MDBX_INCOMPATIBLE) { + userdb_count++; + return rc; + } } - - name = osal_malloc(key->iov_len + 1); - if (unlikely(!name)) - return MDBX_ENOMEM; - memcpy(name, key->iov_base, key->iov_len); - name[key->iov_len] = '\0'; - userdb_count++; - - rc = process_db(~0u, name, handle_userdb, false); - osal_free(name); - if (rc != MDBX_INCOMPATIBLE) - return rc; - return handle_userdb(record_number, key, data); } @@ -4666,8 +4889,8 @@ static const char *db_flags2valuemode(unsigned flags) { } } -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent) { +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; @@ -4676,18 +4899,19 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, int rc, i; struct problem *saved_list; uint64_t problems_count; + const bool second_pass = dbi_handle == MAIN_DBI; uint64_t record_count = 0, dups = 0; uint64_t key_bytes = 0, data_bytes = 0; if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { - print(" ! abort processing '%s' due to a previous error\n", - dbi_name ? dbi_name : "@MAIN"); + print(" ! abort processing %s due to a previous error\n", + sdb_name(dbi_name)); return MDBX_BAD_TXN; } if (dbi_handle == ~0u) { - rc = mdbx_dbi_open_ex( + rc = mdbx_dbi_open_ex2( txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, (dbi_name && ignore_wrong_order) ? 
equal_or_greater : nullptr); @@ -4695,27 +4919,26 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (!dbi_name || rc != MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error("mdbx_dbi_open('%s') failed, error %d %s\n", - dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc)); + error("mdbx_dbi_open(%s) failed, error %d %s\n", sdb_name(dbi_name), rc, + mdbx_strerror(rc)); } return rc; } } - if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && - strcmp(only_subdb, dbi_name) != 0) { + if (dbi_handle >= CORE_DBS && dbi_name && only_subdb.iov_base && + !eq(only_subdb, *dbi_name)) { if (verbose) { - print("Skip processing '%s'...\n", dbi_name); + print("Skip processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); } skipped_subdb++; return MDBX_SUCCESS; } - if (!silent && verbose) { - print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN"); - fflush(nullptr); - } + if (!second_pass && verbose) + print("Processing %s...\n", sdb_name(dbi_name)); + fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); if (rc) { @@ -4729,7 +4952,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, return rc; } - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), db_flags2valuemode(flags)); if (verbose > 1) { @@ -4805,57 +5028,75 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (rc) goto bailout; - bool bad_key = false; - if (key.iov_len > maxkeysize) { - problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); - bad_key = true; - } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && - key.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.iov_len); - bad_key = true; - } + if (!second_pass) { + bool bad_key = false; + if 
(key.iov_len > maxkeysize) { + problem_add("entry", record_count, "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && + key.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; + } - bool bad_data = false; - if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && - data.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.iov_len); - bad_data = true; - } - - if (prev_key.iov_base) { - if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && - prev_data.iov_len != data.iov_len) { - problem_add("entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, - data.iov_len); + bool bad_data = false; + if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && + data.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); bad_data = true; } - if (!bad_key) { - int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); - if (cmp == 0) { - ++dups; - if ((flags & MDBX_DUPSORT) == 0) { - problem_add("entry", record_count, "duplicated entries", nullptr); - if (prev_data.iov_base && data.iov_len == prev_data.iov_len && - memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) { - problem_add("entry", record_count, "complete duplicate", nullptr); - } - } else if (!bad_data && prev_data.iov_base) { - cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); - if (cmp == 0) { - problem_add("entry", record_count, "complete duplicate", nullptr); - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of multi-values", - nullptr); - } - } - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", 
record_count, "wrong order of entries", nullptr); + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + problem_add("entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; } + + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((flags & MDBX_DUPSORT) == 0) { + problem_add("entry", record_count, "duplicated entries", nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == + 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); + if (cmp == 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", record_count, + "wrong order of multi-values", nullptr); + } + } + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", record_count, "wrong order of entries", + nullptr); + } + } + } + + if (!bad_key) { + if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) + print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); + prev_key = key; + } + if (!bad_data) { + if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && + !prev_data.iov_base) + print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); + prev_data = data; } } @@ -4869,17 +5110,6 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, key_bytes += key.iov_len; data_bytes += data.iov_len; - if (!bad_key) { - if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) - print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); - prev_key = key; - } - if (!bad_data) { - if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && - !prev_data.iov_base) - 
print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); - prev_data = data; - } rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); } if (rc != MDBX_NOTFOUND) @@ -4892,7 +5122,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); bailout: problems_count = problems_pop(saved_list); - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 " key's bytes, %" PRIu64 " data's " "bytes, %" PRIu64 " problems\n", @@ -5077,9 +5307,9 @@ int main(int argc, char *argv[]) { } #endif - dbi_meta.name = "@META"; - dbi_free.name = "@GC"; - dbi_main.name = "@MAIN"; + dbi_meta.name.iov_base = MDBX_PGWALK_META; + dbi_free.name.iov_base = MDBX_PGWALK_GC; + dbi_main.name.iov_base = MDBX_PGWALK_MAIN; atexit(pagemap_cleanup); if (argc < 2) @@ -5146,7 +5376,7 @@ int main(int argc, char *argv[]) { envflags &= ~MDBX_RDONLY; #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. 
- * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ envflags |= MDBX_WRITEMAP; #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ break; @@ -5157,9 +5387,10 @@ int main(int argc, char *argv[]) { dont_traversal = true; break; case 's': - if (only_subdb && strcmp(only_subdb, optarg)) + if (only_subdb.iov_base && strcmp(only_subdb.iov_base, optarg)) usage(prog); - only_subdb = optarg; + only_subdb.iov_base = optarg; + only_subdb.iov_len = strlen(optarg); break; case 'i': ignore_wrong_order = true; @@ -5197,9 +5428,10 @@ int main(int argc, char *argv[]) { error("write-mode must be enabled to turn to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } - if (only_subdb || dont_traversal) { - error("whole database checking with tree-traversal are required to turn " - "to the specified meta-page.\n"); + if (only_subdb.iov_base || dont_traversal) { + error( + "whole database checking with b-tree traversal are required to turn " + "to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } } @@ -5390,7 +5622,7 @@ int main(int argc, char *argv[]) { alloc_pages = backed_pages; } } else { - /* LY: DB may be shrinked by writer down to the allocated pages. */ + /* LY: DB may be shrunk by writer down to the allocated pages. */ if (alloc_pages > backed_pages) { print(" ! 
alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", alloc_pages, backed_pages); @@ -5538,8 +5770,8 @@ int main(int argc, char *argv[]) { unused_pages += 1; empty_pages = lost_bytes = 0; - for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { + for (walk_dbi_t *dbi = &dbi_main; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { empty_pages += dbi->pages.empty; lost_bytes += dbi->lost_bytes; } @@ -5549,9 +5781,10 @@ int main(int argc, char *argv[]) { print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", walk.pgcount, unused_pages); if (verbose > 1) { - for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { - print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total); + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + print(" %s: subtotal %" PRIu64, sdb_name(&dbi->name), + dbi->pages.total); if (dbi->pages.other && dbi->pages.other != dbi->pages.total) print(", other %" PRIu64, dbi->pages.other); if (dbi->pages.branch) @@ -5583,14 +5816,15 @@ int main(int argc, char *argv[]) { (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); if (verbose > 2) { - for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) if (dbi->pages.total) { uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", - dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, - dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, + sdb_name(&dbi->name), dbi_bytes, + dbi_bytes * 100.0 / total_page_bytes, dbi->payload_bytes, + dbi->payload_bytes * 100.0 / dbi_bytes, dbi_bytes - dbi->payload_bytes, (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); if (dbi->pages.empty) @@ -5599,7 +5833,7 @@ int main(int 
argc, char *argv[]) { print(", %" PRIu64 " bytes lost", dbi->lost_bytes); print("\n"); } else - print(" %s: empty\n", dbi->name); + print(" %s: empty\n", sdb_name(&dbi->name)); } print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); @@ -5614,21 +5848,12 @@ int main(int argc, char *argv[]) { fflush(nullptr); } - if (!verbose) - print("Iterating DBIs...\n"); - if (data_tree_problems) { - print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN", - data_tree_problems); - problems_maindb = data_tree_problems; - } else - problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); - if (gc_tree_problems) { - print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC", - gc_tree_problems); + print("Skip processing %s since %s is corrupted (%u problems)\n", "@GC", + "b-tree", gc_tree_problems); problems_freedb = gc_tree_problems; } else - problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false); + problems_freedb = process_db(FREE_DBI, MDBX_PGWALK_GC, handle_freedb); if (verbose) { uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; @@ -5660,7 +5885,7 @@ int main(int argc, char *argv[]) { print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } - if (problems_maindb == 0 && problems_freedb == 0) { + if ((problems_maindb = data_tree_problems) == 0 && problems_freedb == 0) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (walk.pgcount != alloc_pages - gc_pages) { @@ -5669,22 +5894,32 @@ int main(int argc, char *argv[]) { walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { - error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", + error("GC pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", unused_pages, gc_pages); } } else if (verbose) { - print(" - skip check used and gc pages (btree-traversal with " + print(" - skip check used and GC pages (btree-traversal with " 
"monopolistic or read-write mode only)\n"); } - if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { - if (!userdb_count && verbose) - print(" - does not contain multiple databases\n"); + problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr); + if (problems_maindb == 0) { + print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); + if (!process_db(MAIN_DBI, nullptr, handle_maindb)) { + if (!userdb_count && verbose) + print(" - does not contain multiple databases\n"); + } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", + "sub-database(s)", "@MAIN", problems_maindb); } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", + "b-tree", data_tree_problems); } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && - (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 && + (envflags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && stuck_meta < 0 && get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 "\n", @@ -5703,7 +5938,7 @@ int main(int argc, char *argv[]) { } } - if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb && + if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb.iov_base && (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { const bool successful_check = (rc | total_problems | problems_meta) == 0; if (successful_check || force_turn_meta) { diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c index 37a804b79..b9bf2d934 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS 
file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -109,27 +109,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was 
padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -139,8 +143,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -157,7 +161,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -450,8 +454,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -729,17 +733,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -750,17 +750,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot 
__attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -826,6 +822,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -997,7 +1015,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1202,7 +1220,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. 
* This is the basic size that the platform's memory manager uses, and is @@ -1215,14 +1234,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1235,8 +1255,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1304,13 +1328,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1327,7 +1350,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1336,11 +1358,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. 
*/ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1351,7 +1375,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1369,11 +1393,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1513,9 +1539,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1537,6 +1564,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1550,9 +1590,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1574,6 +1613,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1730,22 +1770,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1877,6 +1902,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1952,6 +2017,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1961,6 +2028,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1974,6 +2043,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1999,6 +2070,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2113,7 +2197,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2199,6 +2287,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2212,6 +2302,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2221,6 +2313,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2232,6 +2326,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2243,6 +2339,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2251,6 +2350,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2263,8 +2365,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2272,6 +2387,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2297,6 +2414,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2386,6 +2505,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2684,16 +2939,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2735,18 +2986,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2776,6 +3033,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2915,6 +3180,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2964,6 +3233,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3036,7 +3311,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3061,14 +3337,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3084,7 +3353,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3092,7 +3362,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3108,9 +3378,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3159,13 +3431,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3224,7 +3494,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3247,11 +3517,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3301,6 +3577,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3359,12 +3638,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3392,10 +3671,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3407,11 +3688,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3433,6 +3720,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3441,6 +3729,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3452,6 +3741,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3482,139 +3773,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3625,7 +3783,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3641,7 +3800,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3800,16 +3959,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3821,14 +3971,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c index 491e5cb4a..3f232625f 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_drop.c @@ -1,10 +1,10 @@ /* mdbx_drop.c - memory-mapped database delete tool */ /* - * Copyright 2021 Leonid Yuriev + * Copyright 2021-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * - * Copyright 2016-2022 Howard Chu, Symas Corp. + * Copyright 2016-2021 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -24,7 +24,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -36,7 +36,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -111,27 +111,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma 
warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -141,8 +145,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -159,7 +163,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -452,8 +456,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -731,17 +735,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -752,17 +752,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot 
__attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -828,6 +824,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -999,7 +1017,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1204,7 +1222,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. 
* This is the basic size that the platform's memory manager uses, and is @@ -1217,14 +1236,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1237,8 +1257,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1306,13 +1330,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1329,7 +1352,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1338,11 +1360,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. 
*/ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1353,7 +1377,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1371,11 +1395,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1515,9 +1541,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1539,6 +1566,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1552,9 +1592,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1576,6 +1615,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1732,22 +1772,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1879,6 +1904,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1954,6 +2019,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1963,6 +2030,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1976,6 +2045,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -2001,6 +2072,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2115,7 +2199,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2201,6 +2289,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2214,6 +2304,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2223,6 +2315,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2234,6 +2328,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2245,6 +2341,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2253,6 +2352,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2265,8 +2367,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2274,6 +2389,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2299,6 +2416,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2388,6 +2507,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2686,16 +2941,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2737,18 +2988,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2778,6 +3035,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2917,6 +3182,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2966,6 +3235,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3038,7 +3313,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3063,14 +3339,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3086,7 +3355,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3094,7 +3364,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3110,9 +3380,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3161,13 +3433,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3226,7 +3496,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3249,11 +3519,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3303,6 +3579,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3361,12 +3640,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3394,10 +3673,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3409,11 +3690,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3435,6 +3722,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3443,6 +3731,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3454,6 +3743,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3484,139 +3775,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3627,7 +3785,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3643,7 +3802,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3802,16 +3961,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3823,14 +3973,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c index 7c03b3f9c..5cc90c8dc 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -109,27 +109,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment 
specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -139,8 +143,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -157,7 +161,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -450,8 +454,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -729,17 +733,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -750,17 +750,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot 
__attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -826,6 +822,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -997,7 +1015,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1202,7 +1220,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. 
* This is the basic size that the platform's memory manager uses, and is @@ -1215,14 +1234,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1235,8 +1255,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1304,13 +1328,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1327,7 +1350,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1336,11 +1358,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. 
*/ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1351,7 +1375,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1369,11 +1393,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1513,9 +1539,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1537,6 +1564,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1550,9 +1590,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1574,6 +1613,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1730,22 +1770,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1877,6 +1902,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1952,6 +2017,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1961,6 +2028,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1974,6 +2043,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1999,6 +2070,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2113,7 +2197,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2199,6 +2287,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2212,6 +2302,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2221,6 +2313,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2232,6 +2326,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2243,6 +2339,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2251,6 +2350,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2263,8 +2365,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2272,6 +2387,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2297,6 +2414,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2386,6 +2505,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2684,16 +2939,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2735,18 +2986,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2776,6 +3033,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2915,6 +3180,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2964,6 +3233,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3036,7 +3311,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3061,14 +3337,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3084,7 +3353,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3092,7 +3362,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3108,9 +3378,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3159,13 +3431,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3224,7 +3494,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3247,11 +3517,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3301,6 +3577,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3359,12 +3638,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3392,10 +3671,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3407,11 +3688,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3433,6 +3720,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3441,6 +3729,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3452,6 +3741,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3482,139 +3773,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3625,7 +3783,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3641,7 +3800,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3800,16 +3959,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3821,14 +3971,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -4041,7 +4191,7 @@ static const char hexc[] = "0123456789abcdef"; static void dumpbyte(unsigned char c) { putchar(hexc[c >> 4]); - putchar(hexc[c & 0xf]); + putchar(hexc[c & 15]); } static void text(MDBX_val *v) { diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c index 18809c420..0cca77155 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 
2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -109,27 +109,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression 
is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -139,8 +143,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" @@ -157,7 +161,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -450,8 +454,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -729,17 +733,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -750,17 +750,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot 
__attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -826,6 +822,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -997,7 +1015,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1202,7 +1220,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. 
* This is the basic size that the platform's memory manager uses, and is @@ -1215,14 +1234,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1235,8 +1255,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1304,13 +1328,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1327,7 +1350,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1336,11 +1358,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not ioring for now, but on the way. 
*/ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1351,7 +1375,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1369,11 +1393,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1513,9 +1539,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1537,6 +1564,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1550,9 +1590,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1574,6 +1613,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1730,22 +1770,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1877,6 +1902,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1952,6 +2017,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1961,6 +2028,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1974,6 +2043,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1999,6 +2070,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2113,7 +2197,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2199,6 +2287,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2212,6 +2302,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2221,6 +2313,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2232,6 +2326,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2243,6 +2339,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2251,6 +2350,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2263,8 +2365,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2272,6 +2387,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2297,6 +2414,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2386,6 +2505,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2684,16 +2939,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2735,18 +2986,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2776,6 +3033,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2915,6 +3180,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2964,6 +3233,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3036,7 +3311,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3061,14 +3337,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3084,7 +3353,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3092,7 +3362,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3108,9 +3378,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3159,13 +3431,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3224,7 +3494,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3247,11 +3517,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3301,6 +3577,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3359,12 +3638,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3392,10 +3671,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3407,11 +3688,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3433,6 +3720,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3441,6 +3729,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3452,6 +3741,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3482,139 +3773,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3625,7 +3783,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3641,7 +3800,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3800,16 +3959,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3821,14 +3971,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -4648,7 +4798,7 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + 1; + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + (size_t)1; if (kbuf.iov_len >= INTPTR_MAX / 2) { if (!quiet) fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", diff --git a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c index ddf455e3e..0aabb5766 100644 --- a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c +++ b/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx_stat.c @@ -1,7 +1,7 @@ /* 
mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0 +#define MDBX_BUILD_SOURCERY a0e7c54f688eecaf45ddd7493b737f88a97e4e8b0fdaa55c9d3b00d69e0c8548_v0_12_6_0_gc019631a #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -109,27 +109,31 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... 
*/ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ +#endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ #endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ -#pragma warning( \ - disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4201) /* nonstandard extension used: nameless \ + struct/union */ #pragma warning(disable : 4702) /* unreachable code */ #pragma warning(disable : 4706) /* assignment within conditional expression */ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ -#pragma warning( \ - disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4820) /* bytes padding added after data member for \ + alignment */ #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ @@ -139,8 +143,8 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic 
ignored "-Wattributes" @@ -157,7 +161,7 @@ #include "mdbx.h" /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -450,8 +454,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ @@ -729,17 +733,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -750,17 +750,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && !__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) 
|| defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot __attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif @@ -826,6 +822,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) @@ -997,7 +1015,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -1202,7 +1220,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is @@ -1215,14 +1234,15 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -1235,8 +1255,12 @@ typedef struct osal_mmap_param { } osal_mmap_t; typedef union bin128 { - __anonymous_struct_extension__ struct { uint64_t x, y; }; - __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; } bin128_t; #if defined(_WIN32) || defined(_WIN64) @@ -1304,13 +1328,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -1327,7 +1350,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -1336,11 +1358,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually this is not 
ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -1351,7 +1375,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -1369,11 +1393,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif @@ -1513,9 +1539,10 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, @@ -1537,6 +1564,19 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, @@ -1550,9 +1590,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 @@ -1574,6 +1613,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); @@ -1730,22 +1770,7 @@ MDBX_INTERNAL_FUNC int 
osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, @@ -1877,6 +1902,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ @@ -1952,6 +2017,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -1961,6 +2028,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -1974,6 +2043,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -1999,6 +2070,19 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT 
*/ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT @@ -2113,7 +2197,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes @@ -2199,6 +2287,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -2212,6 +2302,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -2221,6 +2313,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -2232,6 +2326,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -2243,6 +2339,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -2251,6 +2350,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -2263,8 +2365,21 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -2272,6 +2387,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -2297,6 +2414,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -2386,6 +2505,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & 
runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -2684,16 +2939,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -2735,18 +2986,24 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) + #pragma pack(pop) typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, @@ -2776,6 +3033,14 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ @@ -2915,6 +3180,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -2964,6 +3233,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ @@ -3036,7 +3311,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -3061,14 +3337,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ @@ -3084,7 +3353,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -3092,7 +3362,7 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) @@ -3108,9 +3378,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif @@ -3159,13 +3431,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being 
updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -3224,7 +3494,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; @@ -3247,11 +3517,17 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -3301,6 +3577,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. 
*/ @@ -3359,12 +3638,12 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -3392,10 +3671,12 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -3407,11 +3688,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -3433,6 +3720,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env 
*me_lcklist_next; @@ -3441,6 +3729,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; @@ -3452,6 +3741,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif @@ -3482,139 +3773,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ @@ -3625,7 +3783,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -3641,7 +3800,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -3800,16 +3959,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -3821,14 +3971,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -4231,6 +4381,17 @@ int main(int argc, char *argv[]) { printf(" WOP: %8" PRIu64 "\t// number of explicit write operations (not a pages) to a disk\n", mei.mi_pgop_stat.wops); + printf(" PreFault: %8" PRIu64 + "\t// number of prefault write operations (not a pages)\n", + mei.mi_pgop_stat.prefault); + printf(" mInCore: %8" PRIu64 "\t// number of mincore() calls\n", + mei.mi_pgop_stat.mincore); + printf(" mSync: %8" PRIu64 + "\t// number of explicit msync-to-disk operations (not a pages)\n", + mei.mi_pgop_stat.msync); + printf(" fSync: %8" PRIu64 + "\t// number of explicit fsync-to-disk operations (not a pages)\n", + 
mei.mi_pgop_stat.fsync); } if (envinfo) {