fix(db): upgrade to libmdbx 0.12.2; (#377)

This commit is contained in:
GalaIO
2022-12-14 01:52:14 +08:00
committed by GitHub
parent 0fa7d5c29b
commit d7f3581e51
24 changed files with 8160 additions and 3161 deletions

View File

@ -34,7 +34,9 @@
## The Future will (be) Positive. Всё будет хорошо.
##
if(CMAKE_VERSION VERSION_LESS 3.12)
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)
@ -241,34 +243,42 @@ else()
option(BUILD_FOR_NATIVE_CPU "Generate code for the compiling machine CPU" OFF)
endif()
if(CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG")
set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON)
else()
set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF)
endif()
if(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE
OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR
(CLANG_LTO_AVAILABLE AND
((DEFINED MDBX_ENABLE_TESTS AND NOT MDBX_ENABLE_TESTS)
OR NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)))
option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization" ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT})
else()
set(INTERPROCEDURAL_OPTIMIZATION OFF)
OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR CLANG_LTO_AVAILABLE)
if((CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") AND
((MSVC_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 19) OR
(GCC_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 7) OR
(CLANG_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 5)))
set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON)
else()
set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF)
endif()
option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization." ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT})
endif()
if(INTERPROCEDURAL_OPTIMIZATION)
if(GCC_LTO_AVAILABLE)
set(LTO_ENABLED TRUE)
set(CMAKE_AR ${CMAKE_GCC_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_CXX_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_NM ${CMAKE_GCC_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE)
set(CMAKE_RANLIB ${CMAKE_GCC_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
set(CMAKE_C_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
set(CMAKE_CXX_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
message(STATUS "MDBX indulge Link-Time Optimization by GCC")
elseif(CLANG_LTO_AVAILABLE)
set(LTO_ENABLED TRUE)
if(CMAKE_CLANG_LD)
set(CMAKE_LINKER ${CMAKE_CLANG_LD} CACHE PATH "Path to lld or ld program with LTO-plugin" FORCE)
endif()
set(CMAKE_AR ${CMAKE_CLANG_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_CXX_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE)
set(CMAKE_NM ${CMAKE_CLANG_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE)
set(CMAKE_RANLIB ${CMAKE_CLANG_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
set(CMAKE_C_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
set(CMAKE_CXX_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE)
message(STATUS "MDBX indulge Link-Time Optimization by CLANG")
elseif(MSVC_LTO_AVAILABLE)
set(LTO_ENABLED TRUE)
@ -375,6 +385,8 @@ if(NOT DEFINED MDBX_CXX_STANDARD)
set(MDBX_CXX_STANDARD 14)
elseif(NOT HAS_CXX11 LESS 0)
set(MDBX_CXX_STANDARD 11)
elseif(CXX_FALLBACK_GNU11 OR CXX_FALLBACK_11)
set(MDBX_CXX_STANDARD 11)
else()
set(MDBX_CXX_STANDARD 98)
endif()
@ -494,16 +506,29 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if(MDBX_NTDLL_EXTRA_IMPLIB)
add_mdbx_option(MDBX_WITHOUT_MSVC_CRT "Avoid dependence from MSVC CRT and use ntdll.dll instead" OFF)
endif()
set(MDBX_AVOID_MSYNC_DEFAULT ON)
else()
add_mdbx_option(MDBX_USE_OFDLOCKS "Use Open file description locks (aka OFD locks, non-POSIX)" AUTO)
mark_as_advanced(MDBX_USE_OFDLOCKS)
set(MDBX_AVOID_MSYNC_DEFAULT OFF)
endif()
add_mdbx_option(MDBX_LOCKING "Locking method (Win32=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO)
option(MDBX_AVOID_MSYNC "Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP mode" ${MDBX_AVOID_MSYNC_DEFAULT})
add_mdbx_option(MDBX_LOCKING "Locking method (Windows=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO)
mark_as_advanced(MDBX_LOCKING)
add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake" AUTO)
mark_as_advanced(MDBX_TRUST_RTC)
option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking" OFF)
option(MDBX_DISABLE_VALIDATION "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF)
option(MDBX_ENABLE_REFUND "Zerocost auto-compactification during write-transactions" ON)
option(MDBX_ENABLE_MADVISE "Using POSIX' madvise() and/or similar hints" ON)
if (CMAKE_TARGET_BITNESS GREATER 32)
set(MDBX_BIGFOOT_DEFAULT ON)
else()
set(MDBX_BIGFOOT_DEFAULT OFF)
endif()
option(MDBX_ENABLE_BIGFOOT "Chunking long list of retired pages during huge transactions commit to avoid use sequences of pages" ${MDBX_BIGFOOT_DEFAULT})
option(MDBX_ENABLE_PGOP_STAT "Gathering statistics for page operations" ON)
option(MDBX_ENABLE_PROFGC "Profiling of GC search and updates" OFF)
if(NOT MDBX_AMALGAMATED_SOURCE)
if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG")
@ -520,7 +545,7 @@ else()
unset(MDBX_LINK_TOOLS_NONSTATIC CACHE)
endif()
if(CMAKE_CXX_COMPILER_LOADED AND MDBX_CXX_STANDARD GREATER_EQUAL 11 AND MDBX_CXX_STANDARD LESS 83)
if(CMAKE_CXX_COMPILER_LOADED AND MDBX_CXX_STANDARD LESS 83 AND NOT MDBX_CXX_STANDARD LESS 11)
if(NOT MDBX_AMALGAMATED_SOURCE)
option(MDBX_ENABLE_TESTS "Build MDBX tests" ${BUILD_TESTING})
endif()
@ -626,9 +651,13 @@ macro(target_setup_options TARGET)
endmacro()
macro(libmdbx_setup_libs TARGET MODE)
target_link_libraries(${TARGET} ${MODE} Threads::Threads)
if(CMAKE_VERSION VERSION_LESS 3.1)
target_link_libraries(${TARGET} ${MODE} ${CMAKE_THREAD_LIBS_INIT})
else()
target_link_libraries(${TARGET} ${MODE} Threads::Threads)
endif()
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
target_link_libraries(${TARGET} ${MODE} ntdll advapi32)
target_link_libraries(${TARGET} ${MODE} ntdll user32 kernel32 advapi32)
if(MDBX_NTDLL_EXTRA_IMPLIB AND MDBX_WITHOUT_MSVC_CRT)
target_link_libraries(${TARGET} ${MODE} ntdll_extra)
endif()
@ -749,7 +778,6 @@ if(MDBX_BUILD_SHARED_LIBRARY)
if(CMAKE_VERSION VERSION_LESS 3.12)
install(TARGETS mdbx EXPORT libmdbx
LIBRARY DESTINATION ${MDBX_DLL_INSTALL_DESTINATION} COMPONENT runtime
OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel)
@ -894,16 +922,16 @@ else()
endif()
if(CMAKE_C_COMPILER_ABI
AND NOT (CMAKE_C_COMPILER_ABI MATCHES ".*${MDBX_BUILD_TARGET}.*" OR MDBX_BUILD_TARGET MATCHES ".*${CMAKE_C_COMPILER_ABI}.*"))
string(APPEND MDBX_BUILD_TARGET "-${CMAKE_C_COMPILER_ABI}")
string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_C_COMPILER_ABI}")
endif()
if(CMAKE_C_PLATFORM_ID
AND NOT (CMAKE_SYSTEM_NAME
AND (CMAKE_C_PLATFORM_ID MATCHES ".*${CMAKE_SYSTEM_NAME}.*" OR CMAKE_SYSTEM_NAME MATCHES ".*${CMAKE_C_PLATFORM_ID}.*"))
AND NOT (CMAKE_C_PLATFORM_ID MATCHES ".*${CMAKE_C_PLATFORM_ID}.*" OR MDBX_BUILD_TARGET MATCHES ".*${CMAKE_C_PLATFORM_ID}.*"))
string(APPEND MDBX_BUILD_TARGET "-${CMAKE_C_COMPILER_ABI}")
string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_C_COMPILER_ABI}")
endif()
if(CMAKE_SYSTEM_NAME)
string(APPEND MDBX_BUILD_TARGET "-${CMAKE_SYSTEM_NAME}")
string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_SYSTEM_NAME}")
endif()
endif()
@ -957,6 +985,7 @@ if (NOT SUBPROJECT)
set(CPACK_PACKAGE_VERSION_COMMIT ${MDBX_VERSION_REVISION})
set(PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${CPACK_PACKAGE_VERSION_COMMIT}")
message(STATUS "libmdbx package version is ${PACKAGE_VERSION}")
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/VERSION.txt" "${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE}.${MDBX_VERSION_REVISION}")
endif()
cmake_policy(POP)

View File

@ -1,6 +1,179 @@
ChangeLog
---------
English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en)
and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md).
## v0.12.2 (Иван Ярыгин) от 2022-11-11
Выпуск с существенными доработками и новой функциональностью
в память о российском борце [Иване Сергеевиче Ярыгине](https://ru.wikipedia.org/wiki/Ярыгин,_Иван_Сергеевич).
На Олимпийских играх в Мюнхене в 1972 году Иван Ярыгин уложил всех соперников на лопатки,
суммарно затратив менее 9 минут. Этот рекорд никем не побит до сих пор.
```
64 files changed, 5573 insertions(+), 2510 deletions(-)
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
```
Новое:
- Поддержка всех основных опций при сборке посредством CMake.
- Требования к CMake понижены до версии 3.0.2 для возможности сборки для устаревших платформ.
- Добавлена возможность профилирования работы GC в сложных и/или нагруженных
сценариях (например Ethereum/Erigon). По-умолчанию соответствующий код отключен,
а для его активации необходимо указать опцию сборки `MDBX_ENABLE_PROFGC=1`.
- Добавлена функция `mdbx_env_warmup()` для "прогрева" БД с возможностью
закрепления страниц в памяти.
В утилиты `mdbx_chk`, `mdbx_copy` и `mdbx_dump` добавлены опции `-u` и `-U`
для активации соответствующего функционала.
- Отключение учета «грязных» страниц в не требующих этого режимах
(`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить
накладные расходы и была запланирована давно, но откладывалась так как
требовала других изменений.
- Вытеснение из памяти (спиллинг) «грязных» страниц с учетом размера
large/overflow-страниц. Доработка позволяет корректно соблюдать политику
задаваемую опциями `MDBX_opt_txn_dp_limit`,
`MDBX_opt_spill_max_denominator`, `MDBX_opt_spill_min_denominator` и
была запланирована давно, но откладывалась так как требовала других
изменений.
- Для Windows в API добавлены UNICODE-зависимые определения макросов
`MDBX_DATANAME`, `MDBX_LOCKNAME` и `MDBX_LOCK_SUFFIX`.
- Переход на преимущественное использование типа `size_t` для
уменьшения накладных расходов на платформе Эльбрус.
- В API добавлены функции `mdbx_limits_valsize4page_max()` и
`mdbx_env_get_valsize4page_max()` возвращающие максимальный размер в
байтах значения, которое может быть размещена в одной
large/overflow-странице, а не последовательности из двух или более таких
страниц. Для таблиц с поддержкой дубликатов вынос значений на
large/overflow-страницы не поддерживается, поэтому результат совпадает с
`mdbx_limits_valsize_max()`.
- В API добавлены функции `mdbx_limits_pairsize4page_max()`и
`mdbx_env_get_pairsize4page_max()` возвращающие в байтах максимальный
суммарный размер пары ключ-значение для их размещения на одной листовой
страницы, без выноса значения на отдельную large/overflow-страницу. Для
таблиц с поддержкой дубликатов вынос значений на large/overflow-страницы
не поддерживается, поэтому результат определяет максимальный/допустимый
суммарный размер пары ключ-значение.
- Реализовано использование асинхронной (overlapped) записи в Windows,
включая использования небуфферизированного ввода-вывода и `WriteGather()`.
Это позволяет сократить накладные расходы и частично обойти проблемы
Windows с низкой производительностью ввода-вывода, включая большие
задержки `FlushFileBuffers()`. Новый код также обеспечивает консолидацию
записываемых регионов на всех платформах, а на Windows использование
событий (events) сведено к минимум, одновременно с автоматических
использованием `WriteGather()`. Поэтому ожидается существенное снижение
накладных расходов взаимодействия с ОС, а в Windows это ускорение, в
некоторых сценариях, может быть кратным в сравнении с LMDB.
- Добавлена опция сборки `MDBX_AVOID_MSYNC`, которая определяет
поведение libmdbx в режиме `MDBX_WRITE_MAP` (когда данные изменяются
непосредственно в отображенных в ОЗУ страницах БД):
* Если `MDBX_AVOID_MSYNC=0` (по умолчанию на всех системах кроме Windows),
то (как прежде) сохранение данных выполняется посредством `msync()`,
либо `FlushViewOfFile()` на Windows. На платформах с полноценной
подсистемой виртуальной памяти и адекватным файловым вводом-выводом
это обеспечивает минимум накладных расходов (один системный вызов)
и максимальную производительность. Однако, на Windows приводит
к значительной деградации, в том числе из-за того что после
`FlushViewOfFile()` требуется также вызов `FlushFileBuffers()`
с массой проблем и суеты внутри ядра ОС.
* Если `MDBX_AVOID_MSYNC=1` (по умолчанию только на Windows), то
сохранение данных выполняется явной записью в файл каждой измененной
страницы БД. Это требует дополнительных накладных расходов, как
на отслеживание измененных страниц (ведение списков "грязных"
страниц), так и на системные вызовы для их записи.
Кроме этого, с точки зрения подсистемы виртуальной памяти ядра ОС,
страницы БД измененные в ОЗУ и явно записанные в файл, могут либо
оставаться "грязными" и быть повторно записаны ядром ОС позже,
либо требовать дополнительных накладных расходов для отслеживания
PTE (Page Table Entries), их модификации и дополнительного копирования
данных. Тем не менее, по имеющейся информации, на Windows такой путь
записи данных в целом обеспечивает более высокую производительность.
- Улучшение эвристики включения авто-слияния записей GC.
- Изменение формата LCK и семантики некоторых внутренних полей. Версии
libmdbx использующие разный формат не смогут работать с одной БД
одновременно, а только поочередно (LCK-файл переписывается при открытии
первым открывающим БД процессом).
- В `C++` API добавлены методы фиксации транзакции с получением информации
о задержках.
- Added `MDBX_HAVE_BUILT IN_CPU_SUPPORTS` build option to control use GCC's
`__builtin_cpu_supports()` function, which could be unavailable on a fake
OSes (macos, ios, android, etc).
Исправления (без корректировок вышеперечисленных новых функций):
- Устранения ряда предупреждений при сборке посредством MinGW.
- Устранение ложно-положительных сообщений от Valgrind об использовании
не инициализированных данных из-за выравнивающих зазоров в `struct troika`.
- Исправлен возврат неожиданной ошибки `MDBX_BUSY` из функций `mdbx_env_set_option()`,
`mdbx_env_set_syncbytes()` и `mdbx_env_set_syncperiod()`.
- Небольшие исправления для совместимости с CMake 3.8
- Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`.
- Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C,
где макросы `ATOMIC_*_LOCK_FREE` ошибочно переопределяются через функции.
- Использование `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` при наличии.
Это решает проблему срабатывания проверочного утверждения при сборке для
платформ где тип `off_t` шире соответствующих полей `структуры flock`,
используемой для блокировки файлов.
- Доработан сбор информации о задержках при фиксации транзакций:
* Устранено искажение замеров длительности обновления GC
при включении отладочного внутреннего аудита;
* Защита от undeflow-нуля только общей задержки в метриках,
чтобы исключить ситуации, когда сумма отдельных стадий
больше общей длительности.
- Ряд исправлений для устранения срабатываний проверочных утверждения в
отладочных сборках.
- Более осторожное преобразование к типу `mdbx_tid_t` для устранения
предупреждений.
- Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC`
при обновлении GC.
- Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()`
which could result in returning `MDBX_EKEYMISMATCH` for valid cases.
- Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected).
Мелочи:
- Исторические ссылки cвязанные с удалённым на ~~github~~ проектом перенаправлены на [web.archive.org](https://web.archive.org/web/https://github.com/erthink/libmdbx).
- Синхронизированны конструкции CMake между проектами.
- Добавлено предупреждение о небезопасности RISC-V.
- Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`.
- Добавлено обходное решение для минимизации ложно-положительных
конфликтов при использовании файловых блокировок в Windows.
- Проверка атомарности C11-операций c 32/64-битными данными.
- Уменьшение в 42 раза значения по-умолчанию для `me_options.dp_limit`
в отладочных сборках.
- Добавление платформы `gcc-riscv64-linux-gnu` в список для цели `cross-gcc`.
- Небольшие правки скрипта `long_stochastic.sh` для работы в Windows.
- Удаление ненужного вызова `LockFileEx()` внутри `mdbx_env_copy()`.
- Добавлено описание использования файловых дескрипторов в различных режимах.
- Добавлено использование `_CrtDbgReport()` в отладочных сборках.
- Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`.
- Removed description of deprecated usage of `MDBX_NODUPDATA`.
- Fixed regression ASAN/Valgring-enabled builds.
- Fixed minor MingGW warning.
-------------------------------------------------------------------------------
## v0.12.1 (Positive Proxima) at 2022-08-24
The planned frontward release with new superior features on the day of 20 anniversary of [Positive Technologies](https://ptsecurty.com).
@ -46,10 +219,75 @@ Not a release but preparation for changing feature set and API.
-------------------------------------------------------------------------------
## v0.11.13 at (Swashplate) 2022-11-10
The stable bugfix release in memory of [Boris Yuryev](https://ru.wikipedia.org/wiki/Юрьев,_Борис_Николаевич) on his 133rd birthday.
```
30 files changed, 405 insertions(+), 136 deletions(-)
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
```
Fixes:
- Fixed builds with older libc versions after using `fcntl64()` (backport).
- Fixed builds with older `stdatomic.h` versions,
where the `ATOMIC_*_LOCK_FREE` macros mistakenly redefined using functions (backport).
- Added workaround for `mremap()` defect to avoid assertion failure (backport).
- Workaround for `encryptfs` bug(s) in the `copy_file_range` implementation (backport).
- Fixed unexpected `MDBX_BUSY` from `mdbx_env_set_option()`, `mdbx_env_set_syncbytes()`
and `mdbx_env_set_syncperiod()` (backport).
- CMake requirements lowered to version 3.0.2 (backport).
Minors:
- Minor clarification output of `--help` for `mdbx_test` (backport).
- Added admonition of insecure for RISC-V (backport).
- Stochastic scripts and CMake files synchronized with the `devel` branch.
- Use `--dont-check-ram-size` for small-tests make-targets (backport).
## v0.11.12 (Эребуни) at 2022-10-12
The stable bugfix release.
```
11 files changed, 96 insertions(+), 49 deletions(-)
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
```
Fixes:
- Fixed static assertion failure on platforms where the `off_t` type is wider
than corresponding fields of `struct flock` used for file locking (backport).
Now _libmdbx_ will use `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` if available.
- Fixed assertion check inside `page_retire_ex()` (backport).
Minors:
- Fixed `-Wint-to-pointer-cast` warnings while casting to `mdbx_tid_t` (backport).
- Removed needless `LockFileEx()` inside `mdbx_env_copy()` (backport).
## v0.11.11 (Тендра-1790) at 2022-09-11
The stable bugfix release.
```
10 files changed, 38 insertions(+), 21 deletions(-)
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
```
Fixes:
- Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases.
- Fixed an extra ensure/assertion check of `oldest_reader` inside `mdbx_txn_end()`.
- Fixed derived C++ builds by removing `MDBX_INTERNAL_FUNC` for `mdbx_w2mb()` and `mdbx_mb2w()`.
## v0.11.10 (the TriColor) at 2022-08-22
The stable bugfix release.
It is planned that this will be the last release of the v0.11 branch.
```
14 files changed, 263 insertions(+), 252 deletions(-)
@ -76,7 +314,6 @@ Minors:
- Use current transaction geometry for untouched parameters when `env_set_geometry()` called within a write transaction.
- Minor clarified `iov_page()` failure case.
-------------------------------------------------------------------------------
## v0.11.9 (Чирчик-1992) at 2022-08-02
@ -194,7 +431,7 @@ New:
- Support build by MinGW' make from command line without CMake.
- Added `mdbx::filesystem` C++ API namespace that corresponds to `std::filesystem` or `std::experimental::filesystem`.
- Created [website](https://libmdbx.dqdkfa.ru/) for online auto-generated documentation.
- Used `https://web.archive.org/web/20220414235959/https://github.com/erthink/` for dead (or temporarily lost) resources deleted by ~~Github~~.
- Used `https://web.archive.org/web/https://github.com/erthink/libmdbx` for dead (or temporarily lost) resources deleted by ~~Github~~.
- Added `--loglevel=` command-line option to the `mdbx_test` tool.
- Added few fast smoke-like tests into CMake builds.
@ -234,7 +471,7 @@ Minors:
The stable release with the complete workaround for an incoherence flaw of Linux unified page/buffer cache.
Nonetheless the cause for this trouble may be an issue of Intel CPU cache/MESI.
See [issue#269](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for more information.
See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information.
Acknowledgements:
@ -243,8 +480,8 @@ Acknowledgements:
Fixes:
- [Added complete workaround](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache.
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions.
- [Added complete workaround](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions.
- Fixed copy&paste typo inside `mdbx::cursor::find_multivalue()`.
Minors:
@ -259,7 +496,7 @@ Minors:
## v0.11.5 at 2022-02-23
The release with the temporary hotfix for a flaw of Linux unified page/buffer cache.
See [issue#269](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for more information.
See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information.
Acknowledgements:
@ -269,10 +506,10 @@ Acknowledgements:
Fixes:
- [Added hotfix](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache.
- [Fixed/Reworked](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API.
- [Added hotfix](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache.
- [Fixed/Reworked](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API.
- Fixed potential `SIGSEGV` while open DB with overrided non-default page size.
- [Made](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases.
- [Made](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases.
- Refined/Fixed pages reservation inside `mdbx_update_gc()` to avoid non-reclamation in a rare cases.
- Fixed typo in a retained space calculation for the hsr-callback.
@ -305,15 +542,15 @@ New features, extensions and improvements:
Fixes:
- Fixed handling `MDBX_opt_rp_augment_limit` for GC's records from huge transactions (Erigon/Akula/Ethereum).
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`).
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`).
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`.
- Fixed missing `&` for `std::ostream &operator<<()` overloads.
- Fixed unexpected `EXDEV` (Cross-device link) error from `mdbx_env_copy()`.
- Fixed base64 encoding/decoding bugs in auxillary C++ API.
- Fixed overflow of `pgno_t` during checking PNL on 64-bit platforms.
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling.
- Reworked checking `MAX_PAGENO` and DB upper-size geometry limit.
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK.
Minors:
@ -340,10 +577,10 @@ Acknowledgements:
New features, extensions and improvements:
- [Added](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`.
- [Added](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`.
- [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`.
- [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`.
- C++ API is finalized now.
- The GC update stage has been [significantly speeded](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem).
- The GC update stage has been [significantly speeded](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem).
Fixes:
@ -354,12 +591,12 @@ Minors:
- Fixed returning `MDBX_RESULT_TRUE` (unexpected -1) from `mdbx_env_set_option()`.
- Added `mdbx_env_get_syncbytes()` and `mdbx_env_get_syncperiod()`.
- [Clarified](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`.
- [Clarified](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`.
- Reworked/simplified `mdbx_env_sync_internal()`.
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases.
- Avoiding extra looping inside `mdbx_env_info_ex()`.
- Explicitly enabled core dumps from stochastic tests scripts on Linux.
- [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions.
- [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions.
- For compatibility reverted returning `MDBX_ENODATA`for some cases.
@ -375,10 +612,10 @@ Acknowledgements:
Fixes:
- [Fixed compilation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7.
- [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page.
- [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation.
- [Fixed compilation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers.
- [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7.
- [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page.
- [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation.
- [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers.
Minors:
@ -395,7 +632,7 @@ Minors:
The database format signature has been changed to prevent
forward-interoperability with an previous releases, which may lead to a
[false positive diagnosis of database corruption](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/238)
[false positive diagnosis of database corruption](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/238)
due to flaws of an old library versions.
This change is mostly invisible:
@ -447,7 +684,7 @@ Acknowledgements:
Fixes:
- Fixed possibility of looping update GC during transaction commit (no public issue since the problem was discovered inside [Positive Technologies](https://www.ptsecurity.ru)).
- Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/235).
- Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/235).
- Fixed `noexcept` for potentially throwing `txn::put()` of C++ API.
Minors:
@ -473,7 +710,7 @@ Extensions and improvements:
Fixes:
- Always setup `madvise` while opening DB (fixes https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/231).
- Always setup `madvise` while opening DB (fixes https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/231).
- Fixed checking legacy `P_DIRTY` flag (`0x10`) for nested/sub-pages.
Minors:
@ -494,11 +731,11 @@ Acknowledgements:
- [Lionel Debroux](https://github.com/debrouxl) for fuzzing tests and reporting bugs.
- [Sergey Fedotov](https://github.com/SergeyFromHell/) for [`node-mdbx` NodeJS bindings](https://www.npmjs.com/package/node-mdbx).
- [Kris Zyp](https://github.com/kriszyp) for [`lmdbx-store` NodeJS bindings](https://github.com/kriszyp/lmdbx-store).
- [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/commits/python-bindings).
- [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings).
New features, extensions and improvements:
- [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/201).
- [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/201).
- Added options support for `long-stochastic` script.
- Avoided `MDBX_TXN_FULL` error for large transactions when possible.
- The `MDBX_READERS_LIMIT` increased to `32767`.
@ -506,7 +743,7 @@ New features, extensions and improvements:
- Minimized the size of poisoned/unpoisoned regions to avoid Valgrind/ASAN stuck.
- Added more workarounds for QEMU for testing builds for 32-bit platforms, Alpha and Sparc architectures.
- `mdbx_chk` now skips iteration & checking of DB' records if corresponding page-tree is corrupted (to avoid `SIGSEGV`, ASAN failures, etc).
- Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/217).
- Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217).
Backward compatibility break:
@ -518,18 +755,18 @@ Backward compatibility break:
Fixes:
- Fixed excess meta-pages checks in case `mdbx_chk` is called to check the DB for a specific meta page and thus could prevent switching to the selected meta page, even if the check passed without errors.
- Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/203).
- Fixed [log a warning during a new DB creation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/205).
- Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/207).
- Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/208).
- Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/209).
- Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/191).
- Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/203).
- Fixed [log a warning during a new DB creation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/205).
- Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/207).
- Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/208).
- Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/209).
- Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/191).
- Fixed `safe64_reset()` for platforms without atomic 64-bit compare-and-swap.
- Fixed hang/shutdown on big-endian platforms without `__cxa_thread_atexit()`.
- Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/217).
- Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217).
- Fixed extra `noexcept` for `buffer::&assign_reference()`.
- Fixed `bootid` generation on Windows for case of change system' time.
- Fixed [test framework keygen-related issue](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/127).
- Fixed [test framework keygen-related issue](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/127).
## v0.10.1 at 2021-06-01
@ -550,10 +787,10 @@ New features:
Fixes:
- Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library.
- Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/197).
- Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/197).
- Fixed `#elsif` typo.
- Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/195).
- Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/97).
- Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/195).
- Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/97).
## v0.10.0 at 2021-05-09
@ -576,7 +813,7 @@ New features:
and conjointly with the `MDBX_ENV_CHECKPID=0` and `MDBX_TXN_CHECKOWNER=0` options can yield
up to 30% more performance compared to LMDB.
- Using float point (exponential quantized) representation for internal 16-bit values
of grow step and shrink threshold when huge ones (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/166).
of grow step and shrink threshold when huge ones (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/166).
To minimize the impact on compatibility, only the odd values inside the upper half
of the range (i.e. 32769..65533) are used for the new representation.
- Added the `mdbx_drop` similar to LMDB command-line tool to purge or delete (sub)database(s).
@ -585,7 +822,7 @@ New features:
- The internal node sizes were refined, resulting in a reduction in large/overflow pages in some use cases
and a slight increase in limits for a keys size to ≈½ of page size.
- Added to `mdbx_chk` output number of keys/items on pages.
- Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/180).
- Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/180).
- Major rework page splitting (af9b7b560505684249b76730997f9e00614b8113) for
- An "auto-appending" feature upon insertion for both ascending and
descending key sequences. As a result, the optimality of page filling
@ -593,7 +830,7 @@ New features:
inserting ordered sequences of keys,
- A "splitting at middle" to make page tree more balanced on average.
- Added `mdbx_get_sysraminfo()` to the API.
- Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/183).
- Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183).
- Major rework internal labeling of a dirty pages (958fd5b9479f52f2124ab7e83c6b18b04b0e7dda) for
a "transparent spilling" feature with the gist to make a dirty pages
be ready to spilling (writing to a disk) without further altering ones.
@ -609,7 +846,7 @@ New features:
- Support `make help` to list available make targets.
- Silently `make`'s build by default.
- Preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings) is available now
by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/147).
by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/147).
Backward compatibility break:
@ -624,22 +861,22 @@ Backward compatibility break:
Fixes:
- Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/160).
- Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/168).
- Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/160).
- Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/168).
- Fixed (disallowing) implicit subDB deletion via operations on `@MAIN`'s DBI-handle.
- Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/171).
- Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/170).
- Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/179).
- Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/183).
- Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171).
- Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/170).
- Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/179).
- Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183).
- Fixed `constexpr` specifier for the `slice::invalid()`.
- Fixed (no)readahead auto-handling (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/164).
- Fixed (no)readahead auto-handling (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/164).
- Fixed non-alloy build for Windows.
- Switched to using Heap-functions instead of LocalAlloc/LocalFree on Windows.
- Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/190).
- Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/190).
- Fixed building by GCC 4.8.5 (added workaround for a preprocessor's bug).
- Fixed building C++ part for iOS <= 13.0 (unavailability of `std::filesystem::path`).
- Fixed building for Windows target versions prior to Windows Vista (`WIN32_WINNT < 0x0600`).
- Fixed building by MinGW for Windows (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/155).
- Fixed building by MinGW for Windows (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/155).
-------------------------------------------------------------------------------
@ -662,7 +899,7 @@ Removed options and features:
New features:
- Package for FreeBSD is available now by Mahlon E. Smith.
- New API functions to get/set various options (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/128):
- New API functions to get/set various options (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/128):
- the maximum number of named databases for the environment;
- the maximum number of threads/reader slots;
- threshold (since the last unsteady commit) to force flush the data buffers to disk;
@ -675,7 +912,7 @@ New features:
- maximal part of the dirty pages may be spilled when necessary;
- minimal part of the dirty pages should be spilled when necessary;
- how much of the parent transaction dirty pages will be spilled while start each child transaction;
- Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/123).
- Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123).
- Added `-p` option (purge subDB before loading) to `mdbx_load` tool.
- Reworked spilling of large transaction and committing of nested transactions:
- page spilling code reworked to avoid the flaws and bugs inherited from LMDB;
@ -685,22 +922,22 @@ New features:
- Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options.
- Added `mdbx_default_pagesize()` function.
- Better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means [C11 atomics](https://en.cppreference.com/w/c/atomic).
- Speed up page number lists and dirty page lists (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/132).
- Speed up page number lists and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/132).
- Added `LIBMDBX_NO_EXPORTS_LEGACY_API` build option.
Fixes:
- Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/143).
- Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/143).
- Fixed `mdbx_realloc()` for case of nullptr and `MDBX_WITHOUT_MSVC_CRT=ON` for Windows.
- Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/146).
- Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/153).
- Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/146).
- Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153).
- Fixed minor/potential memory leak during page flushing and unspilling.
- Fixed handling states of cursors's and subDBs's for nested transactions.
- Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit.
- Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/153).
- Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/123).
- Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153).
- Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123).
- Fixed auto-recovery (`weak->steady` with the same boot-id) when Database size at last weak checkpoint is large than at last steady checkpoint.
- Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/157).
- Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/157).
## v0.9.2 at 2020-11-27
@ -738,11 +975,11 @@ Fixes:
- Fixed copy&paste typos.
- Fixed minor false-positive GCC warning.
- Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK.
- Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/121).
- Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/121).
- Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`.
- Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/131).
- Fixed spilled pages checking (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/126).
- Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/136).
- Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/131).
- Fixed spilled pages checking (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/126).
- Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/136).
- Fixed save/restore/commit of cursors for nested transactions.
- Fixed cursors state in rare/special cases (move next beyond end-of-data, after deletion and so on).
- Added workaround for MSVC 19.28 (Visual Studio 16.8) (but may still hang during compilation).

View File

@ -353,7 +353,7 @@ named mutexes are used.
Historically, _libmdbx_ is a deeply revised and extended descendant of the
[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
At first the development was carried out within the
[ReOpenLDAP](https://web.archive.org/web/20220414235959/https://github.com/erthink/ReOpenLDAP) project. About a
[ReOpenLDAP](https://web.archive.org/web/https://github.com/erthink/ReOpenLDAP) project. About a
year later _libmdbx_ was separated into a standalone project, which was
[presented at Highload++ 2015
conference](http://www.highload.ru/2015/abstracts/1831.html).
@ -435,7 +435,7 @@ unexpected or broken down.
### Testing
The amalgamated source code does not contain any tests for or several reasons.
Please read [the explanation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this.
Please read [the explanation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this.
So for testing _libmdbx_ itself you need a full source code, i.e. the clone of a git repository, there is no option.
The full source code of _libmdbx_ has a [`test` subdirectory](https://gitflic.ru/project/erthink/libmdbx/tree/master/test) with minimalistic test "framework".
@ -618,7 +618,7 @@ Bindings
| Rust | [libmdbx-rs](https://github.com/vorot93/libmdbx-rs) | [Artem Vorotnikov](https://github.com/vorot93) |
| Rust | [mdbx](https://crates.io/crates/mdbx) | [gcxfd](https://github.com/gcxfd) |
| Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) |
| Python (draft) | [python-bindings](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi)
| Python (draft) | [python-bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi)
| .NET (obsolete) | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) |
<!-- section-end -->

View File

@ -1 +1 @@
0.12.1.0
0.12.2.0

View File

@ -13,7 +13,9 @@
## limitations under the License.
##
if(CMAKE_VERSION VERSION_LESS 3.12)
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)
@ -203,9 +205,38 @@ endif()
if(NOT CMAKE_SYSTEM_ARCH)
if(CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID)
set(CMAKE_SYSTEM_ARCH "${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID}")
string(TOLOWER "${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID}" CMAKE_SYSTEM_ARCH)
if(CMAKE_SYSTEM_ARCH STREQUAL "x86")
set(X86_32 TRUE)
elseif(CMAKE_SYSTEM_ARCH STREQUAL "x86_64" OR CMAKE_SYSTEM_ARCH STREQUAL "x64")
set(X86_64 TRUE)
set(CMAKE_SYSTEM_ARCH "x86_64")
elseif(CMAKE_SYSTEM_ARCH MATCHES "^(aarch.*|arm.*)")
if(CMAKE_TARGET_BITNESS EQUAL 64)
set(AARCH64 TRUE)
else()
set(ARM32 TRUE)
endif()
endif()
elseif(CMAKE_ANDROID_ARCH_ABI)
set(CMAKE_SYSTEM_ARCH "${CMAKE_ANDROID_ARCH_ABI}")
if(CMAKE_SYSTEM_ARCH STREQUAL "x86")
set(X86_32 TRUE)
elseif(CMAKE_SYSTEM_ARCH STREQUAL "x86_64")
set(X86_64 TRUE)
elseif(CMAKE_SYSTEM_ARCH MATCHES "^(aarch.*|AARCH.*|arm.*|ARM.*)")
if(CMAKE_TARGET_BITNESS EQUAL 64)
set(AARCH64 TRUE)
else()
set(ARM32 TRUE)
endif()
elseif(CMAKE_SYSTEM_ARCH MATCHES "^(mips|MIPS).*")
if(CMAKE_TARGET_BITNESS EQUAL 64)
set(MIPS64 TRUE)
else()
set(MIPS32 TRUE)
endif()
endif()
elseif(CMAKE_COMPILER_IS_ELBRUSC OR CMAKE_COMPILER_IS_ELBRUSCXX
OR CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ID STREQUAL "LCC"
OR CMAKE_SYSTEM_PROCESSOR MATCHES "e2k.*|E2K.*|elbrus.*|ELBRUS.*")
@ -929,12 +960,13 @@ endmacro(setup_compile_flags)
macro(probe_libcxx_filesystem)
if(CMAKE_CXX_COMPILER_LOADED AND NOT DEFINED LIBCXX_FILESYSTEM)
list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_11 HAS_CXX11)
if(NOT HAS_CXX11 LESS 0)
if(NOT HAS_CXX11 LESS 0 OR CXX_FALLBACK_GNU11 OR CXX_FALLBACK_11)
include(CMakePushCheckState)
include(CheckCXXSourceCompiles)
cmake_push_check_state()
set(stdfs_probe_save_libraries ${CMAKE_REQUIRED_LIBRARIES})
set(stdfs_probe_save_flags ${CMAKE_REQUIRED_FLAGS})
set(stdfs_probe_flags ${CMAKE_REQUIRED_FLAGS})
set(stdfs_probe_save_link_options ${CMAKE_REQUIRED_LINK_OPTIONS})
unset(stdfs_probe_clear_cxx_standard)
if(NOT DEFINED CMAKE_CXX_STANDARD)
@ -945,18 +977,23 @@ macro(probe_libcxx_filesystem)
set(CMAKE_CXX_STANDARD 17)
elseif(NOT HAS_CXX14 LESS 0)
set(CMAKE_CXX_STANDARD 14)
else()
elseif(NOT HAS_CXX11 LESS 0)
set(CMAKE_CXX_STANDARD 11)
elseif(CXX_FALLBACK_GNU11)
set(stdfs_probe_flags ${stdfs_probe_flags} "-std=gnu++11")
else()
set(stdfs_probe_flags ${stdfs_probe_flags} "-std=c++11")
endif()
set(stdfs_probe_clear_cxx_standard ON)
endif()
if(CMAKE_COMPILER_IS_ELBRUSCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1.25.23)
if(CMAKE_VERSION VERSION_LESS 3.14)
set(CMAKE_REQUIRED_FLAGS ${stdfs_probe_save_flags} "-Wl,--allow-multiple-definition")
set(stdfs_probe_flags ${stdfs_probe_flags} "-Wl,--allow-multiple-definition")
else()
set(CMAKE_REQUIRED_LINK_OPTIONS ${stdfs_probe_save_link_options} "-Wl,--allow-multiple-definition")
endif()
endif()
set(CMAKE_REQUIRED_FLAGS ${stdfs_probe_flags})
set(stdfs_probe_code [[
#if defined(__SIZEOF_INT128__) && !defined(__GLIBCXX_TYPE_INT_N_0) && defined(__clang__) && __clang_major__ < 4
@ -1037,6 +1074,7 @@ macro(probe_libcxx_filesystem)
unset(stdfs_probe_clear_cxx_standard)
unset(stdfs_probe_save_link_options)
unset(stdfs_probe_save_flags)
unset(stdfs_probe_flags)
unset(stdfs_probe_save_libraries)
unset(stdfs_probe_code)
unset(stdfs_probe_rc)

View File

@ -13,7 +13,9 @@
## limitations under the License.
##
if(CMAKE_VERSION VERSION_LESS 3.12)
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)

View File

@ -13,7 +13,9 @@
## limitations under the License.
##
if(CMAKE_VERSION VERSION_LESS 3.12)
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)

View File

@ -27,6 +27,12 @@
#cmakedefine01 MDBX_TRUST_RTC
#endif
#cmakedefine01 MDBX_DISABLE_VALIDATION
#cmakedefine01 MDBX_AVOID_MSYNC
#cmakedefine01 MDBX_ENABLE_REFUND
#cmakedefine01 MDBX_ENABLE_MADVISE
#cmakedefine01 MDBX_ENABLE_BIGFOOT
#cmakedefine01 MDBX_ENABLE_PGOP_STAT
#cmakedefine01 MDBX_ENABLE_PROFGC
/* Windows */
#cmakedefine01 MDBX_WITHOUT_MSVC_CRT

View File

@ -1,6 +1,6 @@
.\" Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_CHK 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_CHK 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_chk \- MDBX checking tool
.SH SYNOPSIS
@ -81,6 +81,13 @@ Turn to a specified meta-page on successful check.
.BR \-T
Turn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!
.TP
.BR \-u
Warms up the DB before checking via notifying OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before checking, notifying the OS kernel of subsequent access to the database pages,
then forcibly loads ones by sequential access and tries to lock database pages in memory.
.TP
.BR \-n
Open MDBX environment(s) which do not use subdirectories.
This is legacy option. For now MDBX handles this automatically.

View File

@ -2,7 +2,7 @@
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_COPY 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_COPY 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_copy \- MDBX environment copy tool
.SH SYNOPSIS
@ -45,6 +45,13 @@ or unused pages will be omitted from the copy. This option will
slow down the backup process as it is more CPU-intensive.
Currently it fails if the environment has suffered a page leak.
.TP
.BR \-u
Warms up the DB before copying via notifying OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before copying, notifying the OS kernel of subsequent access to the database pages,
then forcibly loads ones by sequential access and tries to lock database pages in memory.
.TP
.BR \-n
Open MDBX environment(s) which do not use subdirectories.
This is legacy option. For now MDBX handles this automatically.

View File

@ -1,7 +1,7 @@
.\" Copyright 2021-2022 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DROP 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_DROP 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_drop \- MDBX database delete tool
.SH SYNOPSIS

View File

@ -2,7 +2,7 @@
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DUMP 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_DUMP 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_dump \- MDBX environment export tool
.SH SYNOPSIS
@ -66,6 +66,13 @@ Dump a specific subdatabase. If no database is specified, only the main database
.BR \-r
Rescure mode. Ignore some errors to dump corrupted DB.
.TP
.BR \-u
Warms up the DB before dumping via notifying OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before dumping, notifying the OS kernel of subsequent access to the database pages,
then forcibly loads ones by sequential access and tries to lock database pages in memory.
.TP
.BR \-n
Dump an MDBX database which does not use subdirectories.
This is legacy option. For now MDBX handles this automatically.

View File

@ -2,7 +2,7 @@
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_LOAD 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_LOAD 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_load \- MDBX environment import tool
.SH SYNOPSIS

View File

@ -2,7 +2,7 @@
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_STAT 1 "2022-08-24" "MDBX 0.12.1"
.TH MDBX_STAT 1 "2022-11-11" "MDBX 0.12.2"
.SH NAME
mdbx_stat \- MDBX environment status tool
.SH SYNOPSIS

File diff suppressed because it is too large Load Diff

View File

@ -12,7 +12,7 @@
* <http://www.OpenLDAP.org/license.html>. */
#define xMDBX_ALLOY 1
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -127,7 +127,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h++"
/*
@ -194,7 +198,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -367,10 +371,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -394,8 +394,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1147,9 +1149,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1180,18 +1179,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1230,8 +1227,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1257,10 +1386,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1303,8 +1475,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1332,12 +1503,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1371,7 +1546,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1414,9 +1589,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1526,6 +1708,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1659,6 +1844,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1773,6 +1963,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1792,7 +1989,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1821,23 +2018,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1894,6 +2090,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1949,7 +2166,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2035,13 +2255,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2053,6 +2267,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2287,7 +2507,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2491,14 +2711,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2510,10 +2750,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2644,13 +2905,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2661,20 +2925,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2777,7 +3041,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2787,10 +3051,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2809,11 +3073,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2823,8 +3093,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2843,6 +3113,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2864,9 +3137,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2925,18 +3202,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2947,8 +3224,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3002,9 +3279,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3015,7 +3290,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3064,14 +3339,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3143,6 +3424,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3168,7 +3450,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3273,10 +3555,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3347,7 +3641,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3468,12 +3764,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3530,20 +3826,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}
@ -3603,7 +3903,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
@ -5013,6 +5313,15 @@ void txn_managed::commit() {
MDBX_CXX20_UNLIKELY err.throw_exception();
}
void txn_managed::commit(commit_latency *latency) {
const error err =
static_cast<MDBX_error_t>(::mdbx_txn_commit_ex(handle_, latency));
if (MDBX_LIKELY(err.code() != MDBX_THREAD_MISMATCH))
MDBX_CXX20_LIKELY handle_ = nullptr;
if (MDBX_UNLIKELY(err.code() != MDBX_SUCCESS))
MDBX_CXX20_UNLIKELY err.throw_exception();
}
//------------------------------------------------------------------------------
bool txn::drop_map(const char *name, bool throw_if_absent) {

View File

@ -75,6 +75,14 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#ifndef LIBMDBX_H
#define LIBMDBX_H
#if defined(__riscv) || defined(__riscv__) || defined(__RISCV) || \
defined(__RISCV__)
#warning The RISC-V architecture is intentionally insecure by design. \
Please delete this admonition at your own risk, \
if you make such decision informed and consciously. \
Refer to https://clck.ru/32d9xH for more information.
#endif /* RISC-V */
#ifdef _MSC_VER
#pragma warning(push, 1)
#pragma warning(disable : 4548) /* expression before comma has no effect; \
@ -830,8 +838,14 @@ enum MDBX_constants {
#if !(defined(_WIN32) || defined(_WIN64))
#define MDBX_LOCKNAME "/mdbx.lck"
#else
#define MDBX_LOCKNAME L"\\mdbx.lck"
#endif
#define MDBX_LOCKNAME_W L"\\mdbx.lck"
#define MDBX_LOCKNAME_A "\\mdbx.lck"
#ifdef UNICODE
#define MDBX_LOCKNAME MDBX_LOCKNAME_W
#else
#define MDBX_LOCKNAME MDBX_LOCKNAME_A
#endif /* UNICODE */
#endif /* Windows */
#endif /* MDBX_LOCKNAME */
#ifndef MDBX_DATANAME
/** \brief The name of the data file in the environment
@ -839,8 +853,14 @@ enum MDBX_constants {
#if !(defined(_WIN32) || defined(_WIN64))
#define MDBX_DATANAME "/mdbx.dat"
#else
#define MDBX_DATANAME L"\\mdbx.dat"
#endif
#define MDBX_DATANAME_W L"\\mdbx.dat"
#define MDBX_DATANAME_A "\\mdbx.dat"
#ifdef UNICODE
#define MDBX_DATANAME MDBX_DATANAME_W
#else
#define MDBX_DATANAME MDBX_DATANAME_A
#endif /* UNICODE */
#endif /* Windows */
#endif /* MDBX_DATANAME */
#ifndef MDBX_LOCK_SUFFIX
@ -848,8 +868,14 @@ enum MDBX_constants {
#if !(defined(_WIN32) || defined(_WIN64))
#define MDBX_LOCK_SUFFIX "-lck"
#else
#define MDBX_LOCK_SUFFIX L"-lck"
#endif
#define MDBX_LOCK_SUFFIX_W L"-lck"
#define MDBX_LOCK_SUFFIX_A "-lck"
#ifdef UNICODE
#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_W
#else
#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_A
#endif /* UNICODE */
#endif /* Windows */
#endif /* MDBX_LOCK_SUFFIX */
/* DEBUG & LOGGING ************************************************************/
@ -970,8 +996,16 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t)
* called before printing the message and aborting.
* \see mdbx_setup_debug()
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] msg The assertion message, not including newline. */
* \param [in] loglevel The severity of message.
* \param [in] function The function name which emits message,
* may be NULL.
* \param [in] line The source code line number which emits message,
* may be zero.
* \param [in] fmt The printf-like format string with message.
* \param [in] args The variable argument list respectively for the
* format-message string passed by `fmt` argument.
* Maybe NULL or invalid if the format-message string
* don't contain `%`-specification of arguments. */
typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function,
int line, const char *fmt,
va_list args) MDBX_CXX17_NOEXCEPT;
@ -990,8 +1024,12 @@ LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level,
* called before printing the message and aborting.
* \see mdbx_env_set_assert()
*
* \param [in] env An environment handle returned by mdbx_env_create().
* \param [in] msg The assertion message, not including newline. */
* \param [in] env An environment handle.
* \param [in] msg The assertion message, not including newline.
* \param [in] function The function name where the assertion check failed,
* may be NULL.
* \param [in] line The line number in the source file
* where the assertion check failed, may be zero. */
typedef void MDBX_assert_func(const MDBX_env *env, const char *msg,
const char *function,
unsigned line) MDBX_CXX17_NOEXCEPT;
@ -1020,12 +1058,15 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf,
const size_t bufsize);
/** \brief Panics with message and causes abnormal process termination. */
LIBMDBX_API void mdbx_panic(const char *fmt, ...) MDBX_PRINTF_ARGS(1, 2);
MDBX_NORETURN LIBMDBX_API void mdbx_panic(const char *fmt, ...)
MDBX_PRINTF_ARGS(1, 2);
/** \brief Panics with asserton failed message and causes abnormal process
* termination. */
LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg,
const char *func, unsigned line);
MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env,
const char *msg,
const char *func,
unsigned line);
/** end of c_debug @} */
/** \brief Environment flags
@ -1593,8 +1634,7 @@ enum MDBX_put_flags_t {
MDBX_NOOVERWRITE = UINT32_C(0x10),
/** Has effect only for \ref MDBX_DUPSORT databases.
* For upsertion: don't write if the key-value pair already exist.
* For deletion: remove all values for key. */
* For upsertion: don't write if the key-value pair already exist. */
MDBX_NODUPDATA = UINT32_C(0x20),
/** For upsertion: overwrite the current key/data pair.
@ -2424,7 +2464,7 @@ typedef struct MDBX_stat MDBX_stat;
/** \brief Return statistics about the MDBX environment.
* \ingroup c_statinfo
*
* At least one of env or txn argument must be non-null. If txn is passed
* At least one of `env` or `txn` argument must be non-null. If txn is passed
* non-null then stat will be filled accordingly to the given transaction.
* Otherwise, if txn is null, then stat will be populated by a snapshot from
* the last committed write transaction, and at next time, other information
@ -2495,7 +2535,9 @@ struct MDBX_envinfo {
uint64_t mi_unsync_volume;
/** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */
uint64_t mi_autosync_threshold;
/** Time since the last steady sync in 1/65536 of second */
/** Time since entering to a "dirty" out-of-sync state in units of 1/65536 of
* second. In other words, this is the time since the last non-steady commit
* or zero if it was steady. */
uint32_t mi_since_sync_seconds16dot16;
/** Current auto-sync period in 1/65536 of second,
* see \ref mdbx_env_set_syncperiod(). */
@ -2524,8 +2566,9 @@ struct MDBX_envinfo {
uint64_t wops; /**< Number of explicit write operations (not a pages)
to a disk */
uint64_t
gcrtime_seconds16dot16; /**< Time spent loading and searching inside
GC (aka FreeDB) in 1/65536 of second. */
msync; /**< Number of explicit msync-to-disk operations (not a pages) */
uint64_t
fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */
} mi_pgop_stat;
};
#ifndef __cplusplus
@ -2536,7 +2579,7 @@ typedef struct MDBX_envinfo MDBX_envinfo;
/** \brief Return information about the MDBX environment.
* \ingroup c_statinfo
*
* At least one of env or txn argument must be non-null. If txn is passed
* At least one of `env` or `txn` argument must be non-null. If txn is passed
* non-null then stat will be filled accordingly to the given transaction.
* Otherwise, if txn is null, then stat will be populated by a snapshot from
* the last committed write transaction, and at next time, other information
@ -2782,6 +2825,94 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) {
return mdbx_env_close_ex(env, false);
}
/** \brief Warming up options
* \ingroup c_settings
* \anchor warmup_flags
* \see mdbx_env_warmup() */
enum MDBX_warmup_flags_t {
/** By default \ref mdbx_env_warmup() just ask OS kernel to asynchronously
* prefetch database pages. */
MDBX_warmup_default = 0,
/** Peeking all pages of allocated portion of the database
* to force ones to be loaded into memory. However, the pages are just peeks
* sequentially, so unused pages that are in GC will be loaded in the same
* way as those that contain payload. */
MDBX_warmup_force = 1,
/** Using system calls to peeks pages instead of directly accessing ones,
* which at the cost of additional overhead avoids killing the current
* process by OOM-killer in a lack of memory condition.
* \note Has effect only on POSIX (non-Windows) systems with conjunction
* to \ref MDBX_warmup_force option. */
MDBX_warmup_oomsafe = 2,
/** Try to lock database pages in memory by `mlock()` on POSIX-systems
* or `VirtualLock()` on Windows. Please refer to description of these
* functions for reasonability of such locking and the information of
* effects, including the system as a whole.
*
* Such locking in memory requires that the corresponding resource limits
* (e.g. `RLIMIT_RSS`, `RLIMIT_MEMLOCK` or process working set size)
* and the availability of system RAM are sufficiently high.
*
* On successful, all currently allocated pages, both unused in GC and
* containing payload, will be locked in memory until the environment closes,
* or explicitly unblocked by using \ref MDBX_warmup_release, or the
* database geomenry will changed, including its auto-shrinking. */
MDBX_warmup_lock = 4,
/** Alters corresponding current resource limits to be enough for lock pages
* by \ref MDBX_warmup_lock. However, this option should be used in simpliest
* applications since takes into account only current size of this environment
* disregarding all other factors. For real-world database application you
* will need full-fledged management of resources and their limits with
* respective engineering. */
MDBX_warmup_touchlimit = 8,
/** Release the lock that was performed before by \ref MDBX_warmup_lock. */
MDBX_warmup_release = 16,
};
#ifndef __cplusplus
typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t)
#endif
/** \brief Warms up the database by loading pages into memory, optionally lock
* ones. \ingroup c_settings
*
* Depending on the specified flags, notifies OS kernel about following access,
* force loads the database pages, including locks ones in memory or releases
* such a lock. However, the function does not analyze the b-tree nor the GC.
* Therefore an unused pages that are in GC handled (i.e. will be loaded) in
* the same way as those that contain payload.
*
* At least one of `env` or `txn` argument must be non-null.
*
* \param [in] env An environment handle returned
* by \ref mdbx_env_create().
* \param [in] txn A transaction handle returned
* by \ref mdbx_txn_begin().
* \param [in] flags The \ref warmup_flags, bitwise OR'ed together.
*
* \param [in] timeout_seconds_16dot16 Optional timeout which checking only
* during explicitly peeking database pages
* for loading ones if the \ref MDBX_warmup_force
* option was spefified.
*
* \returns A non-zero error value on failure and 0 on success.
* Some possible errors are:
*
* \retval MDBX_ENOSYS The system does not support requested
* operation(s).
*
* \retval MDBX_RESULT_TRUE The specified timeout is reached during load
* data into memory. */
LIBMDBX_API int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
MDBX_warmup_flags_t flags,
unsigned timeout_seconds_16dot16);
/** \brief Set environment flags.
* \ingroup c_settings
*
@ -3113,6 +3244,21 @@ mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags);
MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags);
/** \brief Returns maximal size of key-value pair to fit in a single page with
* the given size and database flags, or -1 if pagesize is invalid.
* \ingroup c_statinfo
* \see db_flags */
MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags);
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
* single overflow/large-page with the given page size and database flags,
* or -1 if pagesize is invalid.
* \ingroup c_statinfo
* \see db_flags */
MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
mdbx_limits_valsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags);
/** \brief Returns maximal write transaction size (i.e. limit for summary volume
* of dirty pages) in bytes for given page size, or -1 if pagesize is invalid.
* \ingroup c_statinfo */
@ -3268,6 +3414,32 @@ mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags);
MDBX_DEPRECATED MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
mdbx_env_get_maxkeysize(const MDBX_env *env);
/** \brief Returns maximal size of key-value pair to fit in a single page
* for specified database flags.
* \ingroup c_statinfo
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY
* and so on). \see db_flags
*
* \returns The maximum size of a data can write,
* or -1 if something is wrong. */
MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags);
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
* single overflow/large-page for specified database flags.
* \ingroup c_statinfo
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY
* and so on). \see db_flags
*
* \returns The maximum size of a data can write,
* or -1 if something is wrong. */
MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
mdbx_env_get_valsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags);
/** \brief Sets application information (a context pointer) associated with
* the environment.
* \see mdbx_env_get_userctx()
@ -3546,8 +3718,8 @@ struct MDBX_commit_latency {
/** \brief Duration of preparation (commit child transactions, update
* sub-databases records and cursors destroying). */
uint32_t preparation;
/** \brief Duration of GC/freeDB handling & updation. */
uint32_t gc;
/** \brief Duration of GC update by wall clock. */
uint32_t gc_wallclock;
/** \brief Duration of internal audit if enabled. */
uint32_t audit;
/** \brief Duration of writing dirty/modified data pages to a filesystem,
@ -3560,6 +3732,80 @@ struct MDBX_commit_latency {
uint32_t ending;
/** \brief The total duration of a commit. */
uint32_t whole;
/** \brief User-mode CPU time spent on GC update. */
uint32_t gc_cputime;
/** \brief Информация для профилирования работы GC.
* \note Статистика является общей для всех процессов работающих с одним
* файлом БД и хранится в LCK-файле. Данные аккумулируются при фиксации всех
* транзакций, но только в сборках libmdbx c установленной опцией
* \ref MDBX_ENABLE_PROFGC. Собранная статистика возвращаются любому процессу
* при использовании \ref mdbx_txn_commit_ex() и одновременно обнуляется
* при завершении транзакций верхнего уровня (не вложенных). */
struct {
/** \brief Количество итераций обновления GC,
* больше 1 если были повторы/перезапуски. */
uint32_t wloops;
/** \brief Количество итераций слияния записей GC. */
uint32_t coalescences;
/** \brief Количество уничтожений предыдущих надежных/устойчивых
* точек фиксации при работе в режиме \ref MDBX_UTTERLY_NOSYNC. */
uint32_t wipes;
/** \brief Количество принудительных фиксаций на диск
* во избежания приращения БД при работе вне режима
* \ref MDBX_UTTERLY_NOSYNC. */
uint32_t flushes;
/** \brief Количество обращений к механизму Handle-Slow-Readers
* во избежания приращения БД.
* \see MDBX_hsr_func */
uint32_t kicks;
/** \brief Счетчик выполнения по медленному пути (slow path execution count)
* GC ради данных пользователя. */
uint32_t work_counter;
/** \brief Время "по настенным часам" затраченное на чтение и поиск внутри
* GC ради данных пользователя. */
uint32_t work_rtime_monotonic;
/** \brief Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC для данных пользователя,
* включая подкачку с диска. */
uint32_t work_xtime_monotonic;
/** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск
* внтури GC ради данных пользователя. */
uint32_t work_rtime_cpu;
/** \brief Количество итераций поиска внутри GC при выделении страниц
* ради данных пользователя. */
uint32_t work_rsteps;
/** \brief Количество запросов на выделение последовательностей страниц
* ради данных пользователя. */
uint32_t work_xpages;
/** \brief Количество страничных промахов (page faults) внутри GC
* при выделении и подготовки страниц для данных пользователя. */
uint32_t work_majflt;
/** \brief Счетчик выполнения по медленному пути (slow path execution count)
* GC для целей поддержки и обновления самой GC. */
uint32_t self_counter;
/** \brief Время "по настенным часам" затраченное на чтение и поиск внутри
* GC для целей поддержки и обновления самой GC. */
uint32_t self_rtime_monotonic;
/** \brief Монотонное время по "настенным часам" затраченное на подготовку
* страниц извлекаемых из GC для целей поддержки и обновления самой GC,
* включая подкачку с диска. */
uint32_t self_xtime_monotonic;
/** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск
* внтури GC для целей поддержки и обновления самой GC. */
uint32_t self_rtime_cpu;
/** \brief Количество итераций поиска внутри GC при выделении страниц
* для целей поддержки и обновления самой GC. */
uint32_t self_rsteps;
/** \brief Количество запросов на выделение последовательностей страниц
* для самой GC. */
uint32_t self_xpages;
/** \brief Количество страничных промахов (page faults) внутри GC
* при выделении и подготовки страниц для самой GC. */
uint32_t self_majflt;
} gc_prof;
};
#ifndef __cplusplus
/** \ingroup c_statinfo */

View File

@ -287,7 +287,7 @@ namespace mdbx {
// To enable all kinds of an compiler optimizations we use a byte-like type
// that don't presumes aliases for pointers as does the `char` type and its
// derivatives/typedefs.
// Please see todo4recovery://erased_by_github/libmdbx/issues/263
// Please see https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/263
// for reasoning of the use of `char8_t` type and switching to `__restrict__`.
using byte = char8_t;
#else
@ -3177,6 +3177,7 @@ public:
/// \brief Returns the minimal values size in bytes for specified values
/// mode.
static inline size_t value_min(value_mode) noexcept;
/// \brief Returns the maximal value size in bytes for specified page size
/// and database flags.
static inline size_t value_max(intptr_t pagesize, MDBX_db_flags_t flags);
@ -3189,6 +3190,35 @@ public:
/// \brief Returns the maximal value size in bytes for specified page size
/// and values mode.
static inline size_t value_max(const env &, value_mode);
/// \brief Returns maximal size of key-value pair to fit in a single page
/// for specified size and database flags.
static inline size_t pairsize4page_max(intptr_t pagesize,
MDBX_db_flags_t flags);
/// \brief Returns maximal size of key-value pair to fit in a single page
/// for specified page size and values mode.
static inline size_t pairsize4page_max(intptr_t pagesize, value_mode);
/// \brief Returns maximal size of key-value pair to fit in a single page
/// for given environment and database flags.
static inline size_t pairsize4page_max(const env &, MDBX_db_flags_t flags);
/// \brief Returns maximal size of key-value pair to fit in a single page
/// for specified page size and values mode.
static inline size_t pairsize4page_max(const env &, value_mode);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified size and database flags.
static inline size_t valsize4page_max(intptr_t pagesize,
MDBX_db_flags_t flags);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified page size and values mode.
static inline size_t valsize4page_max(intptr_t pagesize, value_mode);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for given environment and database flags.
static inline size_t valsize4page_max(const env &, MDBX_db_flags_t flags);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified page size and values mode.
static inline size_t valsize4page_max(const env &, value_mode);
/// \brief Returns the maximal write transaction size (i.e. limit for
/// summary volume of dirty pages) in bytes for specified page size.
static inline size_t transaction_size_max(intptr_t pagesize);
@ -3875,12 +3905,31 @@ public:
//----------------------------------------------------------------------------
/// \brief Abandon all the operations of the transaction instead of saving
/// them.
/// \brief Abandon all the operations of the transaction
/// instead of saving ones.
void abort();
/// \brief Commit all the operations of a transaction into the database.
void commit();
using commit_latency = MDBX_commit_latency;
/// \brief Commit all the operations of a transaction into the database
/// and collect latency information.
void commit(commit_latency *);
/// \brief Commit all the operations of a transaction into the database
/// and collect latency information.
void commit(commit_latency &latency) { return commit(&latency); }
/// \brief Commit all the operations of a transaction into the database
/// and return latency information.
/// \returns latency information of commit stages.
commit_latency commit_get_latency() {
commit_latency result;
commit(&result);
return result;
}
};
/// \brief Unmanaged cursor.
@ -4863,6 +4912,56 @@ inline size_t env::limits::value_max(const env &env, value_mode mode) {
return value_max(env, MDBX_db_flags_t(mode));
}
inline size_t env::limits::pairsize4page_max(intptr_t pagesize,
MDBX_db_flags_t flags) {
const intptr_t result = mdbx_limits_pairsize4page_max(pagesize, flags);
if (result < 0)
MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL);
return static_cast<size_t>(result);
}
inline size_t env::limits::pairsize4page_max(intptr_t pagesize,
value_mode mode) {
return pairsize4page_max(pagesize, MDBX_db_flags_t(mode));
}
inline size_t env::limits::pairsize4page_max(const env &env,
MDBX_db_flags_t flags) {
const intptr_t result = mdbx_env_get_pairsize4page_max(env, flags);
if (result < 0)
MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL);
return static_cast<size_t>(result);
}
inline size_t env::limits::pairsize4page_max(const env &env, value_mode mode) {
return pairsize4page_max(env, MDBX_db_flags_t(mode));
}
inline size_t env::limits::valsize4page_max(intptr_t pagesize,
MDBX_db_flags_t flags) {
const intptr_t result = mdbx_limits_valsize4page_max(pagesize, flags);
if (result < 0)
MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL);
return static_cast<size_t>(result);
}
inline size_t env::limits::valsize4page_max(intptr_t pagesize,
value_mode mode) {
return valsize4page_max(pagesize, MDBX_db_flags_t(mode));
}
inline size_t env::limits::valsize4page_max(const env &env,
MDBX_db_flags_t flags) {
const intptr_t result = mdbx_env_get_valsize4page_max(env, flags);
if (result < 0)
MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL);
return static_cast<size_t>(result);
}
inline size_t env::limits::valsize4page_max(const env &env, value_mode mode) {
return valsize4page_max(env, MDBX_db_flags_t(mode));
}
inline size_t env::limits::transaction_size_max(intptr_t pagesize) {
const intptr_t result = mdbx_limits_txnsize_max(pagesize);
if (result < 0)

View File

@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -149,7 +149,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -216,7 +220,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -389,10 +393,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -416,8 +416,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1169,9 +1171,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1202,18 +1201,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1252,8 +1249,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1279,10 +1408,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1325,8 +1497,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1354,12 +1525,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1393,7 +1568,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1436,9 +1611,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1548,6 +1730,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1681,6 +1866,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1795,6 +1985,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1814,7 +2011,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1843,23 +2040,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1916,6 +2112,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1971,7 +2188,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2057,13 +2277,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2075,6 +2289,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2309,7 +2529,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2513,14 +2733,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2532,10 +2772,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2666,13 +2927,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2683,20 +2947,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2799,7 +3063,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2809,10 +3073,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2831,11 +3095,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2845,8 +3115,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2865,6 +3135,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2886,9 +3159,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2947,18 +3224,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2969,8 +3246,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3024,9 +3301,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3037,7 +3312,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3086,14 +3361,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3165,6 +3446,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3190,7 +3472,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3295,10 +3577,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3369,7 +3663,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3490,12 +3786,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3552,20 +3848,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}
@ -4605,21 +4905,24 @@ bailout:
}
static void usage(char *prog) {
fprintf(stderr,
"usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] "
"dbpath\n"
" -V\t\tprint version and exit\n"
" -v\t\tmore verbose, could be used multiple times\n"
" -q\t\tbe quiet\n"
" -c\t\tforce cooperative mode (don't try exclusive)\n"
" -w\t\twrite-mode checking\n"
" -d\t\tdisable page-by-page traversal of B-tree\n"
" -i\t\tignore wrong order errors (for custom comparators case)\n"
" -s subdb\tprocess a specific subdatabase only\n"
" -0|1|2\tforce using specific meta-page 0, or 2 for checking\n"
" -t\t\tturn to a specified meta-page on successful check\n"
" -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n",
prog);
fprintf(
stderr,
"usage: %s "
"[-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] [-u|U] dbpath\n"
" -V\t\tprint version and exit\n"
" -v\t\tmore verbose, could be used multiple times\n"
" -q\t\tbe quiet\n"
" -c\t\tforce cooperative mode (don't try exclusive)\n"
" -w\t\twrite-mode checking\n"
" -d\t\tdisable page-by-page traversal of B-tree\n"
" -i\t\tignore wrong order errors (for custom comparators case)\n"
" -s subdb\tprocess a specific subdatabase only\n"
" -u\t\twarmup database before checking\n"
" -U\t\twarmup and try lock database pages in memory before checking\n"
" -0|1|2\tforce using specific meta-page 0, or 2 for checking\n"
" -t\t\tturn to a specified meta-page on successful check\n"
" -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n",
prog);
exit(EXIT_INTERRUPTED);
}
@ -4758,6 +5061,8 @@ int main(int argc, char *argv[]) {
bool write_locked = false;
bool turn_meta = false;
bool force_turn_meta = false;
bool warmup = false;
MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default;
double elapsed;
#if defined(_WIN32) || defined(_WIN64)
@ -4781,6 +5086,7 @@ int main(int argc, char *argv[]) {
usage(prog);
for (int i; (i = getopt(argc, argv,
"uU"
"0"
"1"
"2"
@ -4840,7 +5146,7 @@ int main(int argc, char *argv[]) {
envflags &= ~MDBX_RDONLY;
#if MDBX_MMAP_INCOHERENT_FILE_WRITE
/* Temporary `workaround` for OpenBSD kernel's flaw.
* See todo4recovery://erased_by_github/libmdbx/issues/67 */
* See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */
envflags |= MDBX_WRITEMAP;
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
break;
@ -4858,6 +5164,14 @@ int main(int argc, char *argv[]) {
case 'i':
ignore_wrong_order = true;
break;
case 'u':
warmup = true;
break;
case 'U':
warmup = true;
warmup_flags =
MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
break;
default:
usage(prog);
}
@ -4959,14 +5273,35 @@ int main(int argc, char *argv[]) {
(envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative");
if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) {
if (verbose) {
print(" - taking write lock...");
fflush(nullptr);
}
rc = mdbx_txn_lock(env, false);
if (rc != MDBX_SUCCESS) {
error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc));
goto bailout;
}
if (verbose)
print(" done\n");
write_locked = true;
}
if (warmup) {
if (verbose) {
print(" - warming up...");
fflush(nullptr);
}
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
if (MDBX_IS_ERROR(rc)) {
error("mdbx_env_warmup(flags %u) failed, error %d %s\n", warmup_flags, rc,
mdbx_strerror(rc));
goto bailout;
}
if (verbose)
print(" %s\n", rc ? "timeout" : "done");
}
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (rc) {
error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc));

View File

@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -149,7 +149,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -216,7 +220,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -389,10 +393,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -416,8 +416,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1169,9 +1171,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1202,18 +1201,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1252,8 +1249,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1279,10 +1408,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1325,8 +1497,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1354,12 +1525,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1393,7 +1568,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1436,9 +1611,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1548,6 +1730,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1681,6 +1866,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1795,6 +1985,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1814,7 +2011,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1843,23 +2040,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1916,6 +2112,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1971,7 +2188,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2057,13 +2277,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2075,6 +2289,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2309,7 +2529,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2513,14 +2733,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2532,10 +2772,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2666,13 +2927,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2683,20 +2947,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2799,7 +3063,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2809,10 +3073,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2831,11 +3095,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2845,8 +3115,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2865,6 +3135,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2886,9 +3159,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2947,18 +3224,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2969,8 +3246,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3024,9 +3301,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3037,7 +3312,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3086,14 +3361,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3165,6 +3446,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3190,7 +3472,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3295,10 +3577,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3369,7 +3663,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3490,12 +3786,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3552,20 +3848,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}
@ -3719,14 +4019,17 @@ static void signal_handler(int sig) {
#endif /* !WINDOWS */
static void usage(const char *prog) {
fprintf(stderr,
"usage: %s [-V] [-q] [-c] src_path [dest_path]\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -c\t\tenable compactification (skip unused pages)\n"
" src_path\tsource database\n"
" dest_path\tdestination (stdout if not specified)\n",
prog);
fprintf(
stderr,
"usage: %s [-V] [-q] [-c] [-u|U] src_path [dest_path]\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -c\t\tenable compactification (skip unused pages)\n"
" -u\t\twarmup database before copying\n"
" -U\t\twarmup and try lock database pages in memory before copying\n"
" src_path\tsource database\n"
" dest_path\tdestination (stdout if not specified)\n",
prog);
exit(EXIT_FAILURE);
}
@ -3737,6 +4040,8 @@ int main(int argc, char *argv[]) {
unsigned flags = MDBX_RDONLY;
unsigned cpflags = 0;
bool quiet = false;
bool warmup = false;
MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default;
for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) {
if (argv[1][1] == 'n' && argv[1][2] == '\0')
@ -3745,8 +4050,14 @@ int main(int argc, char *argv[]) {
cpflags |= MDBX_CP_COMPACT;
else if (argv[1][1] == 'q' && argv[1][2] == '\0')
quiet = true;
else if ((argv[1][1] == 'h' && argv[1][2] == '\0') ||
strcmp(argv[1], "--help") == 0)
else if (argv[1][1] == 'u' && argv[1][2] == '\0')
warmup = true;
else if (argv[1][1] == 'U' && argv[1][2] == '\0') {
warmup = true;
warmup_flags =
MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
} else if ((argv[1][1] == 'h' && argv[1][2] == '\0') ||
strcmp(argv[1], "--help") == 0)
usage(progname);
else if (argv[1][1] == 'V' && argv[1][2] == '\0') {
printf("mdbx_copy version %d.%d.%d.%d\n"
@ -3795,7 +4106,12 @@ int main(int argc, char *argv[]) {
if (rc == MDBX_SUCCESS)
rc = mdbx_env_open(env, argv[1], flags, 0);
if (rc == MDBX_SUCCESS) {
if (rc == MDBX_SUCCESS && warmup) {
act = "warming up";
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
}
if (!MDBX_IS_ERROR(rc)) {
act = "copying";
if (argc == 2) {
mdbx_filehandle_t fd;

View File

@ -36,7 +36,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -151,7 +151,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -218,7 +222,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -391,10 +395,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -418,8 +418,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1171,9 +1173,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1204,18 +1203,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1254,8 +1251,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1281,10 +1410,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1327,8 +1499,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1356,12 +1527,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1395,7 +1570,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1438,9 +1613,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1550,6 +1732,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1683,6 +1868,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1797,6 +1987,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1816,7 +2013,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1845,23 +2042,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1918,6 +2114,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1973,7 +2190,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2059,13 +2279,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2077,6 +2291,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2311,7 +2531,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2515,14 +2735,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2534,10 +2774,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2668,13 +2929,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2685,20 +2949,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2801,7 +3065,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2811,10 +3075,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2833,11 +3097,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2847,8 +3117,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2867,6 +3137,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2888,9 +3161,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2949,18 +3226,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2971,8 +3248,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3026,9 +3303,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3039,7 +3314,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3088,14 +3363,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3167,6 +3448,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3192,7 +3474,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3297,10 +3579,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3371,7 +3665,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3492,12 +3788,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3554,20 +3850,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}

View File

@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -149,7 +149,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -216,7 +220,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -389,10 +393,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -416,8 +416,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1169,9 +1171,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1202,18 +1201,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1252,8 +1249,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1279,10 +1408,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1325,8 +1497,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1354,12 +1525,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1393,7 +1568,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1436,9 +1611,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1548,6 +1730,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1681,6 +1866,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1795,6 +1985,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1814,7 +2011,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1843,23 +2040,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1916,6 +2112,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1971,7 +2188,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2057,13 +2277,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2075,6 +2289,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2309,7 +2529,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2513,14 +2733,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2532,10 +2772,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2666,13 +2927,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2683,20 +2947,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2799,7 +3063,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2809,10 +3073,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2831,11 +3095,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2845,8 +3115,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2865,6 +3135,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2886,9 +3159,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2947,18 +3224,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2969,8 +3246,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3024,9 +3301,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3037,7 +3312,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3086,14 +3361,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3165,6 +3446,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3190,7 +3472,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3295,10 +3577,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3369,7 +3663,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3490,12 +3786,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3552,20 +3848,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}
@ -3892,19 +4192,23 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
}
static void usage(void) {
fprintf(stderr,
"usage: %s [-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] "
"dbpath\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -f\t\twrite to file instead of stdout\n"
" -l\t\tlist subDBs and exit\n"
" -p\t\tuse printable characters\n"
" -r\t\trescue mode (ignore errors to dump corrupted DB)\n"
" -a\t\tdump main DB and all subDBs\n"
" -s name\tdump only the specified named subDB\n"
" \t\tby default dump only the main DB\n",
prog);
fprintf(
stderr,
"usage: %s "
"[-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] [-u|U] "
"dbpath\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -f\t\twrite to file instead of stdout\n"
" -l\t\tlist subDBs and exit\n"
" -p\t\tuse printable characters\n"
" -r\t\trescue mode (ignore errors to dump corrupted DB)\n"
" -a\t\tdump main DB and all subDBs\n"
" -s name\tdump only the specified named subDB\n"
" -u\t\twarmup database before dumping\n"
" -U\t\twarmup and try lock database pages in memory before dumping\n"
" \t\tby default dump only the main DB\n",
prog);
exit(EXIT_FAILURE);
}
@ -3925,11 +4229,14 @@ int main(int argc, char *argv[]) {
char *subname = nullptr, *buf4free = nullptr;
unsigned envflags = 0;
bool alldbs = false, list = false;
bool warmup = false;
MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default;
if (argc < 2)
usage();
while ((i = getopt(argc, argv,
"uU"
"a"
"f:"
"l"
@ -3986,6 +4293,14 @@ int main(int argc, char *argv[]) {
case 'r':
rescue = true;
break;
case 'u':
warmup = true;
break;
case 'U':
warmup = true;
warmup_flags =
MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
break;
default:
usage();
}
@ -4039,6 +4354,14 @@ int main(int argc, char *argv[]) {
goto env_close;
}
if (warmup) {
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
if (MDBX_IS_ERROR(rc)) {
error("mdbx_env_warmup", rc);
goto env_close;
}
}
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_begin", rc);

View File

@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -149,7 +149,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -216,7 +220,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -389,10 +393,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -416,8 +416,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1169,9 +1171,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1202,18 +1201,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1252,8 +1249,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1279,10 +1408,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1325,8 +1497,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1354,12 +1525,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1393,7 +1568,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1436,9 +1611,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1548,6 +1730,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1681,6 +1866,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1795,6 +1985,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1814,7 +2011,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1843,23 +2040,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1916,6 +2112,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1971,7 +2188,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2057,13 +2277,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2075,6 +2289,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2309,7 +2529,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2513,14 +2733,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2532,10 +2772,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2666,13 +2927,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2683,20 +2947,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2799,7 +3063,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2809,10 +3073,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2831,11 +3095,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2845,8 +3115,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2865,6 +3135,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2886,9 +3159,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2947,18 +3224,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2969,8 +3246,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3024,9 +3301,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3037,7 +3312,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3086,14 +3361,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3165,6 +3446,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3190,7 +3472,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3295,10 +3577,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3369,7 +3663,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3490,12 +3786,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3552,20 +3848,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}

View File

@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#define MDBX_BUILD_SOURCERY e17be563de6f6f85e208ded5aacc1387bc0addf6ce5540c99d0d15db2c3e8edd_v0_12_2_0_g9b062cf0
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@ -149,7 +149,11 @@
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */
#endif /* MinGW */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#include "mdbx.h"
/*
@ -216,7 +220,7 @@
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
@ -389,10 +393,6 @@ __extern_C key_t ftok(const char *, int);
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
@ -416,8 +416,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/
@ -1169,9 +1171,6 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@ -1202,18 +1201,16 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
@ -1252,8 +1249,140 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_DIRECT 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
uint8_t flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
MDBX_MAYBE_UNUSED static inline int
osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const size_t npages = bytes >> ior->pagesize_ln2;
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@ -1279,10 +1408,53 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#define MAX_WRITE UINT32_C(0x3f000000)
#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \
!defined(__ANDROID_API__)
#define MDBX_F_SETLK F_SETLK64
#define MDBX_F_SETLKW F_SETLKW64
#define MDBX_F_GETLK F_GETLK64
#if (__GLIBC_PREREQ(2, 28) && \
(defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \
defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \
defined(fcntl64)
#define MDBX_FCNTL fcntl64
#else
#define MDBX_FCNTL fcntl
#endif
#define MDBX_STRUCT_FLOCK struct flock64
#ifndef OFF_T_MAX
#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000)
#endif /* OFF_T_MAX */
#else
#define MDBX_F_SETLK F_SETLK
#define MDBX_F_SETLKW F_SETLKW
#define MDBX_F_GETLK F_GETLK
#define MDBX_FCNTL fcntl
#define MDBX_STRUCT_FLOCK struct flock
#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */
#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64) && !defined(__ANDROID_API__)
#define MDBX_F_OFD_SETLK F_OFD_SETLK64
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64
#define MDBX_F_OFD_GETLK F_OFD_GETLK64
#else
#define MDBX_F_OFD_SETLK F_OFD_SETLK
#define MDBX_F_OFD_SETLKW F_OFD_SETLKW
#define MDBX_F_OFD_GETLK F_OFD_GETLK
#ifndef OFF_T_MAX
#define OFF_T_MAX \
(((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xFffff)
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
@ -1325,8 +1497,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
int iovcnt, uint64_t offset,
size_t expected_written);
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@ -1354,12 +1525,16 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ = 0,
MDBX_OPEN_DXB_LAZY = 1,
MDBX_OPEN_DXB_DSYNC = 2,
MDBX_OPEN_LCK = 3,
MDBX_OPEN_COPY = 4,
MDBX_OPEN_DELETE = 5
MDBX_OPEN_DXB_READ,
MDBX_OPEN_DXB_LAZY,
MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
MDBX_OPEN_DXB_OVERLAPPED,
MDBX_OPEN_DXB_OVERLAPPED_DIRECT,
#endif /* Windows */
MDBX_OPEN_LCK,
MDBX_OPEN_COPY,
MDBX_OPEN_DELETE
};
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@ -1393,7 +1568,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@ -1436,9 +1611,16 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime);
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@ -1548,6 +1730,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
size_t src_n);
#define OSAL_MB2WIDE(FROM, TO) \
do { \
const char *const from_tmp = (FROM); \
@ -1681,6 +1866,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */
@ -1795,6 +1985,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
/** Controls profiling of GC search and updates. */
#ifndef MDBX_ENABLE_PROFGC
#define MDBX_ENABLE_PROFGC 0
#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
#endif /* MDBX_ENABLE_PROFGC */
/** Controls gathering statistics for page operations. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
@ -1814,7 +2011,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
/** Controls use of POSIX madvise() hints and friends. */
/** Controls using of POSIX' madvise() and/or similar hints. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
@ -1843,23 +2040,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
/** Basically, this build-option is for TODO. Guess it should be replaced
* with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
* 0/OFF = Don't track dirty pages at all and don't spilling ones.
* This should be by-default on Linux and may-be other systems
* (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
* properly LRU tracking and async writing on-demand.
* 1/ON = Lite tracking of dirty pages but with LRU labels and explicit
* spilling with msync(MS_ASYNC). */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP
* mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use
* msync() to persist data. This is by-default on Linux and other systems where
* kernel provides properly LRU tracking and effective flushing on-demand. 1/ON
* = Tracking of dirty pages but with LRU labels for spilling and explicit
* persist ones by write(). This may be reasonable for systems which low
* performance of msync() and/or LRU tracking. */
#ifndef MDBX_AVOID_MSYNC
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_AVOID_MSYNC 1
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#define MDBX_AVOID_MSYNC 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1)
#error MDBX_AVOID_MSYNC must be defined as 0 or 1
#endif /* MDBX_AVOID_MSYNC */
/** Controls sort order of internal page number lists.
* This mostly experimental/advanced option with not for regular MDBX users.
@ -1916,6 +2112,27 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
/** If defined then enables use the GCC's `__builtin_cpu_supports()`
* for runtime dispatching depending on the CPU's capabilities. */
#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS
#if defined(__APPLE__) || defined(BIONIC)
/* Never use any modern features on Apple's or Google's OSes
* since a lot of troubles with compatibility and/or performance */
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif defined(__e2k__)
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#elif __has_builtin(__builtin_cpu_supports) || \
defined(__BUILTIN_CPU_SUPPORTS__) || \
(defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1
#else
#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0
#endif
#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \
MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1)
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@ -1971,7 +2188,10 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/** Advanced: Using POSIX OFD-locks (autodetection by default). */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \
#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \
defined(F_OFD_GETLK)) || \
(defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \
defined(F_OFD_GETLK64))) && \
!defined(MDBX_SAFE4QEMU) && \
!defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
@ -2057,13 +2277,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
@ -2075,6 +2289,12 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
@ -2309,7 +2529,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -2513,14 +2733,34 @@ typedef struct MDBX_page {
: PAGETYPE_WHOLE(p))
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs)
#pragma pack(pop)
#if MDBX_ENABLE_PGOP_STAT
typedef struct profgc_stat {
/* Монотонное время по "настенным часам"
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_monotonic;
/* Монотонное время по "настенным часам" затраченное
* на подготовку страниц извлекаемых из GC, включая подкачку с диска. */
uint64_t xtime_monotonic;
/* Процессорное время в режим пользователя
* затраченное на чтение и поиск внутри GC */
uint64_t rtime_cpu;
/* Количество итераций чтения-поиска внутри GC при выделении страниц */
uint32_t rsteps;
/* Количество запросов на выделение последовательностей страниц,
* т.е. когда запрашивает выделение больше одной страницы */
uint32_t xpages;
/* Счетчик выполнения по медленному пути (slow path execution count) */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
} profgc_stat_t;
/* Statistics of page operations overall of all (running, completed and aborted)
* transactions */
typedef struct {
typedef struct pgop_stat {
MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@ -2532,10 +2772,31 @@ typedef struct {
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
msync; /* Number of explicit msync/flush-to-disk operations */
MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */
/* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */
struct {
/* Затраты на поддержку данных пользователя */
profgc_stat_t work;
/* Затраты на поддержку и обновления самой GC */
profgc_stat_t self;
/* Итераций обновления GC,
* больше 1 если были повторы/перезапуски */
uint32_t wloops;
/* Итерации слияния записей GC */
uint32_t coalescences;
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Попытки пнуть тормозящих читателей */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@ -2666,13 +2927,16 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of mmapped DB.
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
* lock at leat one page, so therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/
pgop_stat_t mti_pgop_stat;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
@ -2683,20 +2947,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
@ -2799,7 +3063,7 @@ typedef struct MDBX_dp {
MDBX_page *ptr;
pgno_t pgno;
union {
unsigned extra;
uint32_t extra;
__anonymous_struct_extension__ struct {
unsigned multi : 1;
unsigned lru : 31;
@ -2809,10 +3073,10 @@ typedef struct MDBX_dp {
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
unsigned sorted;
unsigned length;
unsigned pages_including_loose; /* number of pages, but not an entries. */
unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
size_t sorted;
size_t length;
size_t pages_including_loose; /* number of pages, but not an entries. */
size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
@ -2831,11 +3095,17 @@ typedef struct MDBX_dpl {
((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
@ -2845,8 +3115,8 @@ typedef struct MDBX_dpl {
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
/*----------------------------------------------------------------------------*/
/* Internal structures */
@ -2865,6 +3135,9 @@ typedef struct MDBX_dbx {
typedef struct troika {
uint8_t fsm, recent, prefer_steady, tail_and_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
uint32_t unused_pad;
#endif
#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
@ -2886,9 +3159,13 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -2947,18 +3224,18 @@ struct MDBX_txn {
struct {
meta_troika_t troika;
/* In write txns, array of cursors for each DB */
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t *relist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* a sequence to spilling dirty page with LRU policy */
unsigned dirtylru;
size_t dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
@ -2969,8 +3246,8 @@ struct MDBX_txn {
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
unsigned spill_least_removed;
size_t loose_count;
size_t spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
@ -3024,9 +3301,7 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
uint8_t mc_flags; /* see mdbx_cursor */
uint8_t mc_flags;
/* Cursor checking flags. */
#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@ -3037,7 +3312,7 @@ struct MDBX_cursor {
#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40 /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
uint8_t mc_checking; /* page checking level */
uint8_t mc_checking;
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
@ -3086,14 +3361,20 @@ struct MDBX_env {
osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
#define me_fd4data me_ioring.fd
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE me_overlapped_fd, me_data_lock_event;
#endif /* Windows */
osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
struct MDBX_lockinfo *me_lck;
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */
atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for
@ -3165,6 +3446,7 @@ struct MDBX_env {
unsigned me_dp_reserve_len;
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
osal_ioring_t me_ioring;
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t me_remap_guard;
@ -3190,7 +3472,7 @@ struct MDBX_env {
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
unsigned debug_dirtied_est, debug_dirtied_act;
size_t debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */
/* ------------------------------------------------- stub for lck-less mode */
@ -3295,10 +3577,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
#define FATAL(fmt, ...) \
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
mdbx_assert_fail(env, msg, __func__, __LINE__); \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
@ -3369,7 +3663,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
MDBX_INTERNAL_FUNC void global_ctor(void);
MDBX_INTERNAL_FUNC void osal_ctor(void);
MDBX_INTERNAL_FUNC void global_dtor(void);
MDBX_INTERNAL_FUNC void osal_dtor(void);
MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
#endif /* !__cplusplus */
@ -3490,12 +3786,12 @@ typedef struct MDBX_node {
#error "Oops, some flags overlapped or wrong"
#endif
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/*
* /
@ -3552,20 +3848,24 @@ ceil_powerof2(size_t value, size_t granularity) {
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
return __builtin_ctzl(value);
log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
unsigned long index;
_BitScanForward(&index, (unsigned long)value);
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
_BitScanForward(&index, value_uint32);
return index;
#else
static const uint8_t debruijn_ctz32[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}