feat(mdbx): record backtrace of read transactions and log on timeout (#13707)

This commit is contained in:
Alexey Shekhirin
2025-01-27 10:53:06 +00:00
committed by GitHub
parent bdb3969287
commit 6342162826

View File

@ -110,6 +110,7 @@ mod read_transactions {
};
use dashmap::{DashMap, DashSet};
use std::{
backtrace::Backtrace,
sync::{mpsc::sync_channel, Arc},
time::{Duration, Instant},
};
@ -148,11 +149,10 @@ mod read_transactions {
}
/// Removes a transaction from the list of active read transactions.
pub(crate) fn remove_active_read_transaction(
&self,
ptr: *mut ffi::MDBX_txn,
) -> Option<(usize, (TransactionPtr, Instant))> {
self.read_transactions.as_ref()?.remove_active(ptr)
///
/// Returns `true` if the transaction was found and removed.
pub(crate) fn remove_active_read_transaction(&self, ptr: *mut ffi::MDBX_txn) -> bool {
self.read_transactions.as_ref().is_some_and(|txs| txs.remove_active(ptr))
}
/// Returns the number of timed out transactions that were not aborted by the user yet.
@ -172,7 +172,10 @@ mod read_transactions {
///
/// We store `usize` instead of a raw pointer as a key, because pointers are not
/// comparable. The time of transaction opening is stored as a value.
active: DashMap<usize, (TransactionPtr, Instant)>,
///
/// The backtrace of the transaction opening is recorded only when debug assertions are
/// enabled.
active: DashMap<usize, (TransactionPtr, Instant, Option<Arc<Backtrace>>)>,
/// List of timed out transactions that were not aborted by the user yet, hence have a
/// dangling read transaction pointer.
timed_out_not_aborted: DashSet<usize>,
@ -185,16 +188,20 @@ mod read_transactions {
/// Adds a new transaction to the list of active read transactions.
pub(super) fn add_active(&self, ptr: *mut ffi::MDBX_txn, tx: TransactionPtr) {
let _ = self.active.insert(ptr as usize, (tx, Instant::now()));
let _ = self.active.insert(
ptr as usize,
(
tx,
Instant::now(),
cfg!(debug_assertions).then(|| Arc::new(Backtrace::force_capture())),
),
);
}
/// Removes a transaction from the list of active read transactions.
pub(super) fn remove_active(
&self,
ptr: *mut ffi::MDBX_txn,
) -> Option<(usize, (TransactionPtr, Instant))> {
pub(super) fn remove_active(&self, ptr: *mut ffi::MDBX_txn) -> bool {
self.timed_out_not_aborted.remove(&(ptr as usize));
self.active.remove(&(ptr as usize))
self.active.remove(&(ptr as usize)).is_some()
}
/// Returns the number of timed out transactions that were not aborted by the user yet.
@ -215,7 +222,7 @@ mod read_transactions {
// Iterate through active read transactions and time out those that's open for
// longer than `self.max_duration`.
for entry in &self.active {
let (tx, start) = entry.value();
let (tx, start, backtrace) = entry.value();
let duration = now - *start;
if duration > self.max_duration {
@ -241,10 +248,15 @@ mod read_transactions {
// Add the transaction to `timed_out_active`. We can't remove it
// instantly from the list of active transactions, because we
// iterate through it.
timed_out_active.push((txn_ptr, duration, error));
timed_out_active.push((
txn_ptr,
duration,
backtrace.clone(),
error,
));
}
Err(err) => {
error!(target: "libmdbx", %err, "Failed to abort the long-lived read transaction")
error!(target: "libmdbx", %err, ?backtrace, "Failed to abort the long-lived read transaction")
}
}
} else {
@ -256,18 +268,18 @@ mod read_transactions {
// Walk through timed out transactions, and delete them from the list of active
// transactions.
for (ptr, open_duration, err) in timed_out_active.iter().copied() {
for (ptr, open_duration, backtrace, err) in timed_out_active.iter().cloned() {
// Try deleting the transaction from the list of active transactions.
let was_in_active = self.remove_active(ptr).is_some();
let was_in_active = self.remove_active(ptr);
if let Err(err) = err {
if was_in_active {
// If the transaction was in the list of active transactions,
// then user didn't abort it and we failed to do so.
error!(target: "libmdbx", %err, ?open_duration, "Failed to time out the long-lived read transaction");
error!(target: "libmdbx", %err, ?open_duration, ?backtrace, "Failed to time out the long-lived read transaction");
}
} else {
// Happy path, the transaction has been timed out by us with no errors.
warn!(target: "libmdbx", ?open_duration, "Long-lived read transaction has been timed out");
warn!(target: "libmdbx", ?open_duration, ?backtrace, "Long-lived read transaction has been timed out");
// Add transaction to the list of timed out transactions that were not
// aborted by the user yet.
self.timed_out_not_aborted.insert(ptr as usize);
@ -283,7 +295,7 @@ mod read_transactions {
target: "libmdbx",
elapsed = ?now.elapsed(),
active = ?self.active.iter().map(|entry| {
let (tx, start) = entry.value();
let (tx, start, _) = entry.value();
(tx.clone(), start.elapsed())
}).collect::<Vec<_>>(),
"Read transactions"