@RedBeard0531
Created April 22, 2026 07:31
tcmalloc rseq fixes for Linux 6.19
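
The core of this patch is a per-CPU "slab valid" bit: bit 0 of the 32-bit header word for size class 0. Remote operations (StopCpu, ResizeSlabs) clear it before touching a CPU's slab and set it again with release ordering once the slab is consistent, so the rseq fast paths can notice a stale cached slab pointer even though the Linux 6.19 membarrier IPI no longer refreshes cpu_id_start. Below is a minimal C++ sketch of that publish/check protocol; the names StopSlab/PublishSlab/SlabUsable are illustrative only, and the real entry points in the diff are SetStopFlag, ClearStopFlag, and the TCMALLOC_SLAB_VALID_MASK tests done in assembly inside the rseq critical sections.

    #include <atomic>
    #include <cstdint>

    // Sketch of the valid-bit protocol (illustrative names, not the patch's API).
    // Bit 0 of the first header word marks the per-CPU slab as published.
    constexpr int32_t kSlabValidMask = 0x1;  // mirrors TCMALLOC_SLAB_VALID_MASK

    // Remote side: invalidate before mutating the victim CPU's slab.
    void StopSlab(std::atomic<int32_t>* header0) {
      header0->store(0, std::memory_order_relaxed);  // fast paths now bail out
      // ... fence the target CPU, then drain/resize/shrink its slab ...
    }

    // Remote side: publish once the slab is fully initialized again.
    void PublishSlab(std::atomic<int32_t>* header0) {
      // Release so the remote updates are visible before fast paths resume.
      header0->store(kSlabValidMask, std::memory_order_release);
    }

    // Fast-path side (done in assembly inside the rseq critical section in the
    // patch): if the bit is clear, drop the cached slab pointer and take the
    // slow path instead of operating on a torn or reused slab.
    bool SlabUsable(const std::atomic<int32_t>* header0) {
      return (header0->load(std::memory_order_acquire) & kSlabValidMask) != 0;
    }
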
diff --git a/tcmalloc/cpu_cache.h b/tcmalloc/cpu_cache.h
index 3ea24065168..a460ebb5227 100644
--- a/tcmalloc/cpu_cache.h
+++ b/tcmalloc/cpu_cache.h
@@ -726,12 +726,12 @@ static cpu_set_t FillActiveCpuMask() {
}
#ifdef PERCPU_USE_RSEQ
- const bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
+ constexpr bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
#else
- const bool real_cpus = true;
+ constexpr bool real_cpus = true;
#endif
- if (real_cpus) {
+ if constexpr (real_cpus) {
return allowed_cpus;
}
diff --git a/tcmalloc/cpu_cache_test.cc b/tcmalloc/cpu_cache_test.cc
index e90e8e6b904..adc37e4ce76 100644
--- a/tcmalloc/cpu_cache_test.cc
+++ b/tcmalloc/cpu_cache_test.cc
@@ -398,9 +398,8 @@ TEST(CpuCacheTest, Metadata) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
const size_t num_to_move = cache.forwarder().num_objects_to_move(kSizeClass);
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
@@ -528,9 +527,8 @@ TEST(CpuCacheTest, CacheMissStats) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
diff --git a/tcmalloc/internal/percpu.cc b/tcmalloc/internal/percpu.cc
index 2973408bec0..4e0f408c5f5 100644
--- a/tcmalloc/internal/percpu.cc
+++ b/tcmalloc/internal/percpu.cc
@@ -87,10 +87,6 @@ static bool InitThreadPerCpu() {
return false;
}
-bool UsingFlatVirtualCpus() {
- return false;
-}
-
static void InitPerCpu() {
TC_CHECK(NumCPUs() <= std::numeric_limits<uint16_t>::max());
@@ -317,12 +313,14 @@ static void FenceInterruptCPU(int cpu) {
SlowFence(cpu);
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
+void FenceCpu(int cpu) {
// Prevent compiler re-ordering of code below. In particular, the call to
// GetCurrentCpu must not appear in assembly program order until after any
// code that comes before FenceCpu in C++ program order.
CompilerBarrier();
+ constexpr size_t virtual_cpu_id_offset = VirtualCpuIdFieldOffset();
+
// A useful fast path: nothing needs doing at all to order us with respect
// to our own CPU.
if (ABSL_PREDICT_TRUE(IsFastNoInit()) &&
@@ -330,7 +328,7 @@ void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
return;
}
- if (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
+ if constexpr (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
ASSUME(false);
// With virtual CPUs, we cannot identify the true physical core we need to
diff --git a/tcmalloc/internal/percpu.h b/tcmalloc/internal/percpu.h
index b592d9b6ea8..d9f83bb4dc7 100644
--- a/tcmalloc/internal/percpu.h
+++ b/tcmalloc/internal/percpu.h
@@ -25,6 +25,14 @@
// Offset from __rseq_abi to the cached slabs address.
#define TCMALLOC_RSEQ_SLABS_OFFSET -4
+// Bit 0 of header[0] (size class 0) marks a slab as valid/active.
+// After MADV_DONTNEED zeroes old slab pages, the bit stays clear,
+// causing the in-CS check to bail out. SetStopFlag and InitCpuImpl keep
+// a slab invalid with 0; ClearStopFlag publishes it by setting bit 0 once
+// the slab is fully initialized and visible to fast paths.
+#define TCMALLOC_SLAB_VALID_BIT 0
+#define TCMALLOC_SLAB_VALID_MASK 0x1
+
// The bit denotes that tcmalloc_rseq.slabs contains valid slabs offset.
#define TCMALLOC_CACHED_SLABS_BIT 63
#define TCMALLOC_CACHED_SLABS_MASK_SHIFT (1ul << TCMALLOC_CACHED_SLABS_BIT)
@@ -224,7 +232,17 @@ size_t TcmallocSlab_Internal_PopBatch(size_t size_class, void** batch,
// virtue of C linkage) in the supported case.
// Return whether we are using flat virtual CPUs.
-bool UsingFlatVirtualCpus();
+inline constexpr bool UsingFlatVirtualCpus() { return false; }
+
+// Byte offset from &__rseq_abi to the CPU id field used for slab indexing
+// (physical cpu_id vs flat vcpu_id), matching TcmallocSlab::virtual_cpu_id_offset_.
+inline constexpr size_t VirtualCpuIdFieldOffset() {
+ if constexpr (UsingFlatVirtualCpus()) {
+ return offsetof(kernel_rseq, vcpu_id);
+ } else {
+ return offsetof(kernel_rseq, cpu_id);
+ }
+}
enum class RseqVcpuMode { kNone };
inline RseqVcpuMode GetRseqVcpuMode() { return RseqVcpuMode::kNone; }
@@ -278,8 +296,10 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
return cpu;
}
- // Do not return a physical CPU ID when we expect a virtual CPU ID.
- TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ if constexpr (UsingFlatVirtualCpus()) {
+ // Do not return a physical CPU ID when we expect a virtual CPU ID.
+ TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ }
#ifdef TCMALLOC_HAVE_SCHED_GETCPU
cpu = sched_getcpu();
@@ -290,9 +310,7 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
}
inline int GetCurrentVirtualCpuUnsafe() {
- const size_t offset = UsingFlatVirtualCpus() ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
- return GetCurrentVirtualCpuUnsafe(offset);
+ return GetCurrentVirtualCpuUnsafe(VirtualCpuIdFieldOffset());
}
bool InitFastPerCpu();
@@ -375,7 +393,7 @@ inline void TSANReleaseBatch(void** batch, int n) {
#endif
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset);
+void FenceCpu(int cpu);
void FenceAllCpus();
} // namespace percpu
diff --git a/tcmalloc/internal/percpu_rseq_aarch64.S b/tcmalloc/internal/percpu_rseq_aarch64.S
index d6a684c130f..82d4d31a83b 100644
--- a/tcmalloc/internal/percpu_rseq_aarch64.S
+++ b/tcmalloc/internal/percpu_rseq_aarch64.S
@@ -115,6 +115,7 @@
label##_trampoline: \
CFI(.cfi_startproc); \
BTI_C; \
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]; \
b .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline; \
@@ -231,6 +232,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PushBatch_no_capacity
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PushBatch_magic_fail
add x15, x8, x0, LSL #2 /* r15 = hdr */
ldrh w9, [x15] /* r9 = current */
ldrh w10, [x15, #2] /* r10 = end */
@@ -261,6 +264,8 @@ TcmallocSlab_Internal_PushBatch:
.LTcmallocSlab_Internal_PushBatch_commit:
mov x0, x10
ret
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PushBatch_no_capacity:
mov x0, #0
ret
@@ -303,6 +308,8 @@ TcmallocSlab_Internal_PopBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PopBatch_no_items
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PopBatch_magic_fail
add x15, x8, x0, LSL #2
ldrh w9, [x15] /* current */
ldrh w10, [x3] /* begin */
@@ -333,6 +340,8 @@ TcmallocSlab_Internal_PopBatch:
.LTcmallocSlab_Internal_PopBatch_commit:
mov x0, x11
ret
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PopBatch_no_items:
mov x0, #0
ret
diff --git a/tcmalloc/internal/percpu_rseq_x86_64.S b/tcmalloc/internal/percpu_rseq_x86_64.S
index 797fec5572f..f3724663055 100644
--- a/tcmalloc/internal/percpu_rseq_x86_64.S
+++ b/tcmalloc/internal/percpu_rseq_x86_64.S
@@ -98,6 +98,7 @@
.type label##_trampoline, @function; \
label##_trampoline: \
CFI(.cfi_startproc); \
+ CLEAR_SLABS_CACHE; \
jmp .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline;
@@ -137,6 +138,8 @@ label##_trampoline: \
movzwl %fs:__rseq_abi@TPOFF(offset), dest;
#define FETCH_SLABS(dest) \
movq %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET, dest
+#define CLEAR_SLABS_CACHE \
+ movq $0, %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET;
#define START_RSEQ(src) \
.L##src##_abort: \
leaq __rseq_cs_##src(%rip), %rax; \
@@ -154,6 +157,9 @@ label##_trampoline: \
*/
#define FETCH_CPU(dest, offset) movzwl (%rax, offset), dest;
#define FETCH_SLABS(dest) movq TCMALLOC_RSEQ_SLABS_OFFSET(%rax), dest
+#define CLEAR_SLABS_CACHE \
+ call tcmalloc_internal_tls_fetch_pic@PLT; \
+ movq $0, TCMALLOC_RSEQ_SLABS_OFFSET(%rax);
#define START_RSEQ(src) \
.L##src##_abort: \
call tcmalloc_internal_tls_fetch_pic@PLT; \
@@ -243,6 +249,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(%r8);
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PushBatch_full;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PushBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq 2(%r8, %rdi, 4), %r10; /* end */
cmpq %r10, %r9;
@@ -264,6 +272,8 @@ TcmallocSlab_Internal_PushBatch:
movq %rdx, %rax;
subq %r11, %rax;
ret;
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
.LTcmallocSlab_Internal_PushBatch_full:
xor %rax, %rax;
ret;
@@ -305,6 +315,8 @@ TcmallocSlab_Internal_PopBatch:
xorq %rax, %rax;
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PopBatch_commit;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PopBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq (%rcx), %r10; /* begin */
cmp %r10, %r9;
@@ -323,6 +335,10 @@ TcmallocSlab_Internal_PopBatch:
movw %r9w, (%r8, %rdi, 4);
.LTcmallocSlab_Internal_PopBatch_commit:
ret;
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
+ xor %rax, %rax; /* rax is clobbered by CLEAR_SLABS_CACHE on the PIC path. */
+ ret;
CFI(.cfi_endproc)
ENCODE_SIZE(TcmallocSlab_Internal_PopBatch)
DEFINE_UPSTREAM_CS(TcmallocSlab_Internal_PopBatch)
diff --git a/tcmalloc/internal/percpu_tcmalloc.cc b/tcmalloc/internal/percpu_tcmalloc.cc
index a68e39d0a4f..3014b97b6c8 100644
--- a/tcmalloc/internal/percpu_tcmalloc.cc
+++ b/tcmalloc/internal/percpu_tcmalloc.cc
@@ -17,8 +17,8 @@
#include <algorithm>
#include <atomic>
#include <cstddef>
-#include <cstdint>
#include <limits>
+#include <cstdint>
#include <new>
#include <utility>
@@ -43,9 +43,6 @@ void TcmallocSlab::Init(
absl::FunctionRef<size_t(size_t)> capacity, Shift shift) {
TC_ASSERT(num_classes_ == 0 && num_classes != 0);
num_classes_ = num_classes;
- if (UsingFlatVirtualCpus()) {
- virtual_cpu_id_offset_ = offsetof(kernel_rseq, vcpu_id);
- }
stopped_ = new (alloc(sizeof(stopped_[0]) * NumCPUs(),
std::align_val_t{ABSL_CACHELINE_SIZE}))
std::atomic<bool>[NumCPUs()];
@@ -100,8 +97,11 @@ void TcmallocSlab::InitCpuImpl(void* slabs, Shift shift, int cpu,
TC_CHECK_LE((1 << ToUint8(shift)), (1 << 16) * sizeof(void*));
// Initialize prefetch target and compute the offsets for the
- // boundaries of each size class' cache.
+ // boundaries of each size class' cache. Keep the slab invalid until the
+ // caller explicitly publishes it with ClearStopFlag().
void* curr_slab = CpuMemoryStart(slabs, shift, cpu);
+ static_cast<std::atomic<int32_t>*>(curr_slab)->store(
+ 0, std::memory_order_relaxed);
void** elems = reinterpret_cast<void**>(
(reinterpret_cast<uintptr_t>(GetHeader(slabs, shift, cpu, num_classes_)) +
sizeof(void*) - 1) &
@@ -213,7 +213,9 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
absl::FunctionRef<bool(size_t)> populated,
DrainHandler drain_handler) -> ResizeSlabsInfo {
// Phase 1: Stop all CPUs and initialize any CPUs in the new slab that have
- // already been populated in the old slab.
+ // already been populated in the old slab. Keep the new slab invalid until
+ // phase 4 so a stale cached pointer cannot mistake a reused slab address for
+ // a published one during A -> B -> A reuse.
const auto [old_slabs, old_shift] =
GetSlabsAndShift(std::memory_order_relaxed);
TC_ASSERT_NE(new_shift, old_shift);
@@ -221,6 +223,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
+ SetStopFlag(cpu);
if (populated(cpu)) {
InitCpuImpl(new_slabs, new_shift, cpu, capacity);
}
@@ -238,6 +241,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
// Phase 4: Re-start all CPUs.
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
@@ -308,16 +312,34 @@ void TcmallocSlab::Drain(int cpu, DrainHandler drain_handler) {
DrainCpu(slabs, shift, cpu, drain_handler);
}
+void TcmallocSlab::SetStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ flag->store(0, std::memory_order_relaxed);
+}
+
+void TcmallocSlab::ClearStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ // Publish completed remote slab updates before allowing fast paths that
+ // still have a cached slab pointer to proceed based on header[0].
+ flag->store(TCMALLOC_SLAB_VALID_MASK, std::memory_order_release);
+}
+
void TcmallocSlab::StopCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
- FenceCpu(cpu, virtual_cpu_id_offset_);
+ SetStopFlag(cpu);
+ FenceCpu(cpu);
}
void TcmallocSlab::StartCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_ASSERT(stopped_[cpu].load(std::memory_order_relaxed));
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
diff --git a/tcmalloc/internal/percpu_tcmalloc.h b/tcmalloc/internal/percpu_tcmalloc.h
index b5cd584a7bd..bbc74af712d 100644
--- a/tcmalloc/internal/percpu_tcmalloc.h
+++ b/tcmalloc/internal/percpu_tcmalloc.h
@@ -41,7 +41,9 @@
#include "tcmalloc/internal/prefetch.h"
#include "tcmalloc/internal/sysinfo.h"
-#if __clang_major__ >= 11
+// GCC supports asm goto, but at least gcc11 has a codegen bug on x86_64.
+#if (defined(__GNUC__) && !defined(__clang__) && !defined(__x86_64__)) || \
+ (defined(__clang__) && __clang_major__ >= 11)
#define TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT 1
#endif
@@ -227,6 +229,8 @@ class TcmallocSlab {
// synchronization protocol.
void StopCpu(int cpu);
void StartCpu(int cpu);
+ void SetStopFlag(int cpu);
+ void ClearStopFlag(int cpu);
// Grows the cpu/size_class slab's capacity to no greater than
// min(capacity+len, max_capacity(<shift>)) and returns the increment
@@ -355,7 +359,7 @@ class TcmallocSlab {
// so that we can atomically update both with a single store.
std::atomic<SlabsAndShift> slabs_and_shift_{};
// This is in units of bytes.
- size_t virtual_cpu_id_offset_ = offsetof(kernel_rseq, cpu_id);
+ static constexpr size_t virtual_cpu_id_offset_ = VirtualCpuIdFieldOffset();
// Remote Cpu operation (Resize/Drain/Grow/Shrink) is running so any local
// operations (Push/Pop) should fail.
std::atomic<bool>* stopped_ = nullptr;
@@ -399,6 +403,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#if defined(__x86_64__)
#define TCMALLOC_RSEQ_RELOC_TYPE "R_X86_64_NONE"
#define TCMALLOC_RSEQ_JUMP "jmp"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "movq $0, %[rseq_slabs_addr]\n"
#if !defined(__PIC__) && !defined(__PIE__)
#define TCMALLOC_RSEQ_SET_CS(name) \
"movq $__rseq_cs_" #name "_%=, %[rseq_cs_addr]\n"
@@ -430,6 +435,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#define TCMALLOC_RSEQ_CLOBBER "x16", "x17"
#define TCMALLOC_RSEQ_RELOC_TYPE "R_AARCH64_NONE"
#define TCMALLOC_RSEQ_JUMP "b"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "str xzr, %[rseq_slabs_addr]\n"
#define TCMALLOC_RSEQ_SET_CS(name) \
TCMALLOC_RSEQ_TRAMPLINE_SMASH \
"adrp %[scratch], __rseq_cs_" #name \
@@ -490,7 +496,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
"_trampoline_%=,@function\n" \
"" #name \
"_trampoline_%=:\n" \
- "2:\n" TCMALLOC_RSEQ_JUMP \
+ "2:\n" TCMALLOC_RSEQ_CLEAR_SLABS_CACHE TCMALLOC_RSEQ_JUMP \
" 3f\n" \
".size " #name "_trampoline_%=, . - " #name \
"_trampoline_%=\n" \
@@ -508,7 +514,9 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
is no cost to passing unused \
consts. */ \
[cached_slabs_bit] "n"(TCMALLOC_CACHED_SLABS_BIT), \
- [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK)
+ [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK), \
+ [slab_valid_bit] "n"(TCMALLOC_SLAB_VALID_BIT), \
+ [slab_valid_mask] "n"(TCMALLOC_SLAB_VALID_MASK)
// Store v to p (*p = v) if the current thread wasn't rescheduled
// (still has the slab pointer cached). Otherwise returns false.
@@ -566,6 +574,73 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool StoreCurrentCpu(volatile void* p,
return scratch;
}
+// Store new_v to p if the current thread still has the expected cached slab,
+// the current slab is still active, and *p still matches old_v. Otherwise
+// returns false.
+inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool CompareAndSwapCurrentCpuChecked(
+ std::atomic<int32_t>* p, int32_t old_v, int32_t new_v,
+ uintptr_t expected_slabs) {
+ uintptr_t scratch = 0;
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__x86_64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ xorq %[scratch], %[scratch]
+ movq %[rseq_slabs_addr], %[slab]
+ cmpq %[expected_slabs], %[slab]
+ jne 7f
+ btrq $%c[cached_slabs_bit], %[slab]
+ jnc 5f
+ testb $%c[slab_valid_mask], (%[slab])
+ jz 7f
+ movl %[p], %[observed]
+ cmpl %[old_v], %[observed]
+ jne 5f
+ movl $1, %k[scratch]
+ movl %[new_v], %[p]
+ jmp 5f
+ 7:
+ movq $0, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : "cc", "memory");
+#elif TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__aarch64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ mov %[scratch], #0
+ ldr %[slab], %[rseq_slabs_addr]
+ cmp %[slab], %[expected_slabs]
+ b.ne 7f
+ tbz %[slab], #%c[cached_slabs_bit], 5f
+ and %[slab], %[slab], #%c[cached_slabs_mask_neg]
+ ldapr %w[observed], [%[slab]]
+ tbz %w[observed], #%c[slab_valid_bit], 7f
+ ldr %w[observed], %[p]
+ cmp %w[observed], %w[old_v]
+ b.ne 5f
+ mov %[scratch], #1
+ str %w[new_v], %[p]
+ b 5f
+ 7:
+ str xzr, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : TCMALLOC_RSEQ_CLOBBER, "cc", "memory");
+#endif
+ return scratch;
+}
+
// Prefetch slabs memory for the case of repeated pushes/pops.
// Note: this prefetch slows down micro-benchmarks, but provides ~0.1-0.5%
// speedup for larger real applications.
@@ -591,8 +666,16 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[overflow_label]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
- "jae 5f\n" // ae==c
+ "jae 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "clc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = slabs->current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -618,7 +701,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_INPUTS, [size_class] "r"(size_class), [item] "r"(item)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -629,6 +712,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
// Current now points to the slot we are going to push to next.
PrefetchSlabMemory(scratch + current * sizeof(void*));
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -652,9 +737,17 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"tbz %[region_start], #%c[cached_slabs_bit], %l[overflow_label]\n"
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbz %w[scratch], #%c[slab_valid_bit], %l[magic_fail]\n"
#else
"subs %[region_start], %[region_start], %[cached_slabs_mask]\n"
"b.ls 5f\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbnz %w[scratch], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[scratch], %w[scratch]\n"
+ "b 5f\n"
+ "7:\n"
#endif
// end_ptr = &(slab_headers[0]->end)
"add %[end_ptr], %[region_start], #2\n"
@@ -689,7 +782,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -698,6 +791,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
}
#endif
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -762,9 +857,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[underflow_path]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
"cmc\n"
"jc 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "stc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = scratch->header[size_class].current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -797,7 +900,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
[size_class] "r"(size_class)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -814,6 +917,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
PrefetchSlabMemory(scratch + (current - 2) * sizeof(void*));
PrefetchNextObject(next);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -845,6 +950,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"b.eq 5f\n"
#endif
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbz %w[previous], #%c[slab_valid_bit], %l[magic_fail]\n"
+#else
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbnz %w[previous], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[previous], %w[previous]\n"
+ "b 5f\n"
+ "7:\n"
+#endif
// scratch = slab_headers[size_class]->current (current index)
"ldrh %w[scratch], [%[region_start], %[size_class_lsl2]]\n"
// scratch--
@@ -885,7 +1001,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -896,6 +1012,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
TSANAcquire(result);
PrefetchNextObject(prefetch);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -912,6 +1030,9 @@ inline size_t TcmallocSlab::Grow(
absl::FunctionRef<size_t(uint8_t)> max_capacity) {
const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
const size_t max_cap = max_capacity(ToUint8(shift));
+ const uintptr_t expected_slabs =
+ reinterpret_cast<uintptr_t>(CpuMemoryStart(slabs, shift, cpu)) |
+ TCMALLOC_CACHED_SLABS_MASK;
auto* hdrp = GetHeader(slabs, shift, cpu, size_class);
Header hdr = LoadHeader(hdrp);
uint16_t begin = begins_[size_class].load(std::memory_order_relaxed);
@@ -919,9 +1040,19 @@ inline size_t TcmallocSlab::Grow(
if (have <= 0) {
return 0;
}
+ // Grow preloads the header outside the critical section, so the commit must
+ // validate both the cached slab identity and that the header value we read is
+ // still current. Otherwise a stop/start cycle can restore the valid bit and
+ // let us overwrite a remote GrowOtherCache/ShrinkOtherCache update with stale
+ // data.
+ const auto old_hdr = hdr;
uint16_t n = std::min<uint16_t>(len, have);
hdr.end += n;
- return StoreCurrentCpu(hdrp, hdr) ? n : 0;
+ return CompareAndSwapCurrentCpuChecked(hdrp, absl::bit_cast<int32_t>(old_hdr),
+ absl::bit_cast<int32_t>(hdr),
+ expected_slabs)
+ ? n
+ : 0;
}
inline std::pair<int, bool> TcmallocSlab::CacheCpuSlab() {
diff --git a/tcmalloc/testing/background_test.cc b/tcmalloc/testing/background_test.cc
index a6b6e008126..91a16ae1e43 100644
--- a/tcmalloc/testing/background_test.cc
+++ b/tcmalloc/testing/background_test.cc
@@ -18,7 +18,7 @@
#include <atomic>
#include <thread>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/malloc_extension.h"
@@ -30,13 +30,12 @@ namespace tcmalloc {
namespace {
TEST(BackgroundTest, Defaults) {
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
- EXPECT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
absl::Seconds(1));
}
TEST(BackgroundTest, Stress) {
- // Process background actions by setting a custom sleep interval.
struct ProcessActions {
static void Go() {
constexpr absl::Duration kSleepTime = absl::Milliseconds(10);
@@ -45,8 +44,7 @@ TEST(BackgroundTest, Stress) {
}
};
- // Make sure that background acions are indeed enabled.
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
std::thread background(ProcessActions::Go);
@@ -66,9 +64,4 @@ TEST(BackgroundTest, Stress) {
}
} // namespace
-} // namespace tcmalloc
-
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
+} // namespace tcmalloc
\ No newline at end of file
diff --git a/tcmalloc/testing/double_alloc_test.cc b/tcmalloc/testing/double_alloc_test.cc
new file mode 100644
index 00000000000..e00536dfbb2
--- /dev/null
+++ b/tcmalloc/testing/double_alloc_test.cc
@@ -0,0 +1,191 @@
+// Copyright 2025 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Stress test to detect double-allocation caused by the Linux 6.19 rseq bug.
+//
+// On Linux 6.19, membarrier RSEQ IPI no longer writes cpu_id_start.
+// This breaks tcmalloc's StopCpu protocol: ShrinkOtherCache/DrainCpu can
+// read slab objects concurrently with a Pop on the same CPU, giving two
+// callers the same pointer (silent heap corruption).
+//
+// Detection: each allocation is stamped with a per-thread canary. If another
+// thread receives the same pointer, it overwrites the canary. The original
+// owner detects this on its next verification pass.
+
+#include <sched.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdio>
+#include <thread>
+#include <vector>
+
+#include "mongo/unittest/unittest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/malloc_extension.h"
+
+namespace tcmalloc {
+namespace {
+
+constexpr int kNumThreads = 16;
+constexpr int kMaxLivePerThread = 800;
+constexpr absl::Duration kTestDuration = absl::Seconds(30);
+
+constexpr size_t kAllocSizes[] = {16, 32, 48, 64, 80, 128, 256};
+constexpr int kNumSizes = sizeof(kAllocSizes) / sizeof(kAllocSizes[0]);
+
+struct Alloc {
+ void* ptr;
+ size_t size;
+ uint64_t canary;
+};
+
+static uint64_t MakeCanary(int tid, uint64_t counter) {
+ return (static_cast<uint64_t>(tid + 1) << 48) | (counter & 0xFFFFFFFFFFFFULL);
+}
+
+static int CanaryTid(uint64_t canary) {
+ return static_cast<int>(canary >> 48) - 1;
+}
+
+static void StampAlloc(void* ptr, size_t size, uint64_t canary) {
+ auto* p = static_cast<volatile uint64_t*>(ptr);
+ size_t n = size / sizeof(uint64_t);
+ for (size_t i = 0; i < n; ++i) {
+ p[i] = canary;
+ }
+}
+
+static bool VerifyAlloc(const Alloc& a) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ return p[0] == a.canary;
+}
+
+TEST(DoubleAllocTest, DetectCorruption) {
+ MallocExtension::SetBackgroundProcessSleepInterval(absl::Milliseconds(1));
+
+ std::thread background([] {
+ MallocExtension::ProcessBackgroundActions();
+ });
+
+ std::atomic<bool> stop{false};
+ std::atomic<int> canary_corruptions{0};
+ std::atomic<uint64_t> total_allocs{0};
+
+ std::vector<std::thread> threads;
+ for (int tid = 0; tid < kNumThreads; ++tid) {
+ threads.emplace_back([&, tid] {
+ std::vector<Alloc> live;
+ live.reserve(kMaxLivePerThread + 128);
+ uint64_t counter = 0;
+ uint32_t rng = tid * 2654435761u + 1;
+
+ while (!stop.load(std::memory_order_relaxed)) {
+ rng = rng * 1103515245 + 12345;
+ size_t alloc_size = kAllocSizes[rng % kNumSizes];
+
+ for (int i = 0; i < 64 && static_cast<int>(live.size()) < kMaxLivePerThread; ++i) {
+ void* p = ::operator new(alloc_size);
+ uint64_t canary = MakeCanary(tid, ++counter);
+ StampAlloc(p, alloc_size, canary);
+ live.push_back({p, alloc_size, canary});
+ total_allocs.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ for (size_t i = 0; i < live.size(); ++i) {
+ if (!VerifyAlloc(live[i])) {
+ auto* p = static_cast<volatile uint64_t*>(live[i].ptr);
+ uint64_t found = p[0];
+ int found_tid = CanaryTid(found);
+ int expected_tid = CanaryTid(live[i].canary);
+ int corruptions =
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed) + 1;
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (#%d) ***\n"
+ " ptr=%p size=%zu\n"
+ " expected canary=0x%016lx (tid=%d)\n"
+ " found canary=0x%016lx (tid=%d)\n",
+ corruptions, live[i].ptr, live[i].size,
+ (unsigned long)live[i].canary, expected_tid,
+ (unsigned long)found, found_tid);
+ live[i].ptr = nullptr;
+ stop.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t w = 0;
+ for (size_t r = 0; r < live.size(); ++r) {
+ if (live[r].ptr != nullptr) {
+ if (w != r) live[w] = live[r];
+ ++w;
+ }
+ }
+ live.resize(w);
+
+ rng = rng * 1103515245 + 12345;
+ int to_free = live.size() / 2;
+ for (int i = 0; i < to_free; ++i) {
+ auto& a = live.back();
+ if (a.ptr) {
+ if (!VerifyAlloc(a)) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ uint64_t found = p[0];
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed);
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (at free) ***\n"
+ " ptr=%p expected=0x%016lx found=0x%016lx\n",
+ a.ptr, (unsigned long)a.canary, (unsigned long)found);
+ a.ptr = nullptr;
+ } else {
+ ::operator delete(a.ptr, a.size);
+ }
+ }
+ live.pop_back();
+ }
+ }
+
+ if (canary_corruptions.load(std::memory_order_relaxed) == 0) {
+ for (auto& a : live) {
+ if (a.ptr) ::operator delete(a.ptr, a.size);
+ }
+ }
+ });
+ }
+
+ absl::SleepFor(kTestDuration);
+ stop.store(true, std::memory_order_relaxed);
+
+ for (auto& t : threads) t.join();
+
+ MallocExtension::SetBackgroundProcessActionsEnabled(false);
+ background.join();
+
+ uint64_t ops = total_allocs.load();
+ int corruptions = canary_corruptions.load();
+ fprintf(stderr,
+ "\n=== Results ===\n"
+ "Total allocations: %lu\n"
+ "Canary corruptions (double allocations): %d\n",
+ (unsigned long)ops, corruptions);
+
+ if (corruptions > 0) {
+ FAIL("Double allocation detected") << ": " << corruptions << " corruptions out of " << ops << " allocations";
+ }
+}
+
+} // namespace
+} // namespace tcmalloc
\ No newline at end of file
diff --git a/tcmalloc/testing/test_allocator_harness.h b/tcmalloc/testing/test_allocator_harness.h
index e3333d9c7b7..d637b9e3419 100644
--- a/tcmalloc/testing/test_allocator_harness.h
+++ b/tcmalloc/testing/test_allocator_harness.h
@@ -20,7 +20,7 @@
#include <utility>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/base/optimization.h"
#include "absl/random/random.h"
#include "absl/synchronization/mutex.h"
diff --git a/tcmalloc/testing/testutil.h b/tcmalloc/testing/testutil.h
index 4d02ccde8b3..f9ef104e14c 100644
--- a/tcmalloc/testing/testutil.h
+++ b/tcmalloc/testing/testutil.h
@@ -239,7 +239,7 @@ class ScopedFakeCpuId {
// modifying __rseq_abi, we can inject our own CPU ID.
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id = cpu_id;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id = cpu_id;
}
#endif
@@ -252,7 +252,7 @@ class ScopedFakeCpuId {
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
}
diff --git a/tcmalloc/testing/thread_manager.h b/tcmalloc/testing/thread_manager.h
index 7f7acb3d1d4..0a462ce02ac 100644
--- a/tcmalloc/testing/thread_manager.h
+++ b/tcmalloc/testing/thread_manager.h
@@ -20,7 +20,7 @@
#include <thread>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/synchronization/blocking_counter.h"
namespace tcmalloc {
@@ -29,7 +29,7 @@ class ThreadManager {
public:
ThreadManager() : shutdown_(false) {}
~ThreadManager() {
- EXPECT_TRUE(shutdown_.load()) << "ThreadManager not stopped";
+ ASSERT_TRUE(shutdown_.load());
}
// Invokes `func` repeatedly on each of `n` threads until `Stop` is called.