tcmalloc rseq fixes for Linux 6.19
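Summary: on Linux 6.19 the membarrier RSEQ IPI no longer rewrites cpu_id_start, which tcmalloc's StopCpu protocol depended on to kick same-CPU fast paths out of their rseq critical sections; ShrinkOtherCache/DrainCpu could then race a concurrent Pop and hand the same pointer to two callers. The patch below repurposes bit 0 of each CPU slab's header[0] as a valid/active flag: remote operations clear it before mutating a slab and release-set it when done, and every rseq fast path acquire-loads it and bails out, dropping its cached slab pointer, when the bit is clear. A minimal C++ sketch of that protocol (the names header0, StopFastPaths, PublishSlab, and SlabUsable are illustrative, not from the patch):

#include <atomic>
#include <cstdint>

// kSlabValidMask mirrors TCMALLOC_SLAB_VALID_MASK; `header0` stands for the
// 32-bit word at the start of a CPU's slab region (size class 0's header),
// which the patch reuses as a validity flag.
constexpr int32_t kSlabValidMask = 0x1;

// Remote writer, before mutating a CPU's slab (cf. SetStopFlag below):
void StopFastPaths(std::atomic<int32_t>& header0) {
  header0.store(0, std::memory_order_relaxed);
  // FenceCpu(cpu) follows in the real code, so any fast path already inside
  // an rseq critical section is restarted before the mutation begins.
}

// Remote writer, once the mutation is complete (cf. ClearStopFlag below):
void PublishSlab(std::atomic<int32_t>& header0) {
  // Release-store so the finished updates are visible before any fast path
  // that still holds a cached slab pointer can pass the validity check.
  header0.store(kSlabValidMask, std::memory_order_release);
}

// Fast path, at the top of every rseq operation (the new ldapr/testb checks
// in the .S files and inline asm): bail out, dropping the cached pointer,
// if the slab has not been (re)published.
bool SlabUsable(std::atomic<int32_t>& header0, uintptr_t& tcmalloc_slabs) {
  if ((header0.load(std::memory_order_acquire) & kSlabValidMask) == 0) {
    tcmalloc_slabs = 0;  // force the slow path to re-derive the slab
    return false;
  }
  return true;
}

A side benefit noted in the percpu.h comment below: MADV_DONTNEED'ed slab pages read back as zero, so a discarded slab is automatically invalid under this scheme.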
diff --git a/tcmalloc/cpu_cache.h b/tcmalloc/cpu_cache.h
index 3ea24065168..a460ebb5227 100644
--- a/tcmalloc/cpu_cache.h
+++ b/tcmalloc/cpu_cache.h
@@ -726,12 +726,12 @@ static cpu_set_t FillActiveCpuMask() {
}
#ifdef PERCPU_USE_RSEQ
- const bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
+ constexpr bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
#else
- const bool real_cpus = true;
+ constexpr bool real_cpus = true;
#endif
- if (real_cpus) {
+ if constexpr (real_cpus) {
return allowed_cpus;
}
diff --git a/tcmalloc/cpu_cache_test.cc b/tcmalloc/cpu_cache_test.cc
index e90e8e6b904..adc37e4ce76 100644
--- a/tcmalloc/cpu_cache_test.cc
+++ b/tcmalloc/cpu_cache_test.cc
@@ -398,9 +398,8 @@ TEST(CpuCacheTest, Metadata) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
const size_t num_to_move = cache.forwarder().num_objects_to_move(kSizeClass);
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
@@ -528,9 +527,8 @@ TEST(CpuCacheTest, CacheMissStats) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
diff --git a/tcmalloc/internal/percpu.cc b/tcmalloc/internal/percpu.cc
index 2973408bec0..4e0f408c5f5 100644
--- a/tcmalloc/internal/percpu.cc
+++ b/tcmalloc/internal/percpu.cc
@@ -87,10 +87,6 @@ static bool InitThreadPerCpu() {
return false;
}
-bool UsingFlatVirtualCpus() {
- return false;
-}
-
static void InitPerCpu() {
TC_CHECK(NumCPUs() <= std::numeric_limits<uint16_t>::max());
@@ -317,12 +313,14 @@ static void FenceInterruptCPU(int cpu) {
SlowFence(cpu);
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
+void FenceCpu(int cpu) {
// Prevent compiler re-ordering of code below. In particular, the call to
// GetCurrentCpu must not appear in assembly program order until after any
// code that comes before FenceCpu in C++ program order.
CompilerBarrier();
+ constexpr size_t virtual_cpu_id_offset = VirtualCpuIdFieldOffset();
+
// A useful fast path: nothing needs doing at all to order us with respect
// to our own CPU.
if (ABSL_PREDICT_TRUE(IsFastNoInit()) &&
@@ -330,7 +328,7 @@ void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
return;
}
- if (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
+ if constexpr (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
ASSUME(false);
// With virtual CPUs, we cannot identify the true physical core we need to
diff --git a/tcmalloc/internal/percpu.h b/tcmalloc/internal/percpu.h
index b592d9b6ea8..d9f83bb4dc7 100644
--- a/tcmalloc/internal/percpu.h
+++ b/tcmalloc/internal/percpu.h
@@ -25,6 +25,14 @@
// Offset from __rseq_abi to the cached slabs address.
#define TCMALLOC_RSEQ_SLABS_OFFSET -4
+// Bit 0 of header[0] (size class 0) marks a slab as valid/active.
+// After MADV_DONTNEED zeroes old slab pages, the bit stays clear,
+// causing the in-CS check to bail out. SetStopFlag and InitCpuImpl keep
+// a slab invalid with 0; ClearStopFlag publishes it by setting bit 0 once
+// the slab is fully initialized and visible to fast paths.
+#define TCMALLOC_SLAB_VALID_BIT 0
+#define TCMALLOC_SLAB_VALID_MASK 0x1
+
// The bit denotes that tcmalloc_rseq.slabs contains valid slabs offset.
#define TCMALLOC_CACHED_SLABS_BIT 63
#define TCMALLOC_CACHED_SLABS_MASK_SHIFT (1ul << TCMALLOC_CACHED_SLABS_BIT)
@@ -224,7 +232,17 @@ size_t TcmallocSlab_Internal_PopBatch(size_t size_class, void** batch,
// virtue of C linkage) in the supported case.
// Return whether we are using flat virtual CPUs.
-bool UsingFlatVirtualCpus();
+inline constexpr bool UsingFlatVirtualCpus() { return false; }
+
+// Byte offset from &__rseq_abi to the CPU id field used for slab indexing
+// (physical cpu_id vs flat vcpu_id), matching TcmallocSlab::virtual_cpu_id_offset_.
+inline constexpr size_t VirtualCpuIdFieldOffset() {
+ if constexpr (UsingFlatVirtualCpus()) {
+ return offsetof(kernel_rseq, vcpu_id);
+ } else {
+ return offsetof(kernel_rseq, cpu_id);
+ }
+}
enum class RseqVcpuMode { kNone };
inline RseqVcpuMode GetRseqVcpuMode() { return RseqVcpuMode::kNone; }
@@ -278,8 +296,10 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
return cpu;
}
- // Do not return a physical CPU ID when we expect a virtual CPU ID.
- TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ if constexpr (UsingFlatVirtualCpus()) {
+ // Do not return a physical CPU ID when we expect a virtual CPU ID.
+ TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ }
#ifdef TCMALLOC_HAVE_SCHED_GETCPU
cpu = sched_getcpu();
@@ -290,9 +310,7 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
}
inline int GetCurrentVirtualCpuUnsafe() {
- const size_t offset = UsingFlatVirtualCpus() ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
- return GetCurrentVirtualCpuUnsafe(offset);
+ return GetCurrentVirtualCpuUnsafe(VirtualCpuIdFieldOffset());
}
bool InitFastPerCpu();
@@ -375,7 +393,7 @@ inline void TSANReleaseBatch(void** batch, int n) {
#endif
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset);
+void FenceCpu(int cpu);
void FenceAllCpus();
} // namespace percpu
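Because this build never enables flat virtual CPUs, UsingFlatVirtualCpus() can be a constexpr function, and the cpu_id/vcpu_id offset selection folds at compile time, which is what lets FenceCpu() drop its offset parameter and the vcpu branches compile away under if constexpr. A standalone illustration of the pattern (the kernel_rseq layout below is approximated for the sketch, not the exact ABI struct):

#include <cstddef>
#include <cstdint>

// Approximate stand-in for the rseq ABI struct; only the two id fields
// matter for the offset selection.
struct kernel_rseq {
  uint32_t cpu_id_start;
  uint32_t cpu_id;
  uint64_t rseq_cs;
  uint32_t flags;
  uint16_t vcpu_id;
  uint16_t numa_node_id;
};

inline constexpr bool UsingFlatVirtualCpus() { return false; }

inline constexpr size_t VirtualCpuIdFieldOffset() {
  if constexpr (UsingFlatVirtualCpus()) {
    return offsetof(kernel_rseq, vcpu_id);
  } else {
    return offsetof(kernel_rseq, cpu_id);
  }
}

// The selection is a compile-time constant, so callers no longer need to
// thread the offset through at runtime.
static_assert(VirtualCpuIdFieldOffset() == offsetof(kernel_rseq, cpu_id),
              "flat vCPUs disabled: slab indexing uses the physical cpu_id");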
diff --git a/tcmalloc/internal/percpu_rseq_aarch64.S b/tcmalloc/internal/percpu_rseq_aarch64.S
index d6a684c130f..82d4d31a83b 100644
--- a/tcmalloc/internal/percpu_rseq_aarch64.S
+++ b/tcmalloc/internal/percpu_rseq_aarch64.S
@@ -115,6 +115,7 @@
label##_trampoline: \
CFI(.cfi_startproc); \
BTI_C; \
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]; \
b .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline; \
@@ -231,6 +232,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PushBatch_no_capacity
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PushBatch_magic_fail
add x15, x8, x0, LSL #2 /* r15 = hdr */
ldrh w9, [x15] /* r9 = current */
ldrh w10, [x15, #2] /* r10 = end */
@@ -261,6 +264,8 @@ TcmallocSlab_Internal_PushBatch:
.LTcmallocSlab_Internal_PushBatch_commit:
mov x0, x10
ret
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PushBatch_no_capacity:
mov x0, #0
ret
@@ -303,6 +308,8 @@ TcmallocSlab_Internal_PopBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PopBatch_no_items
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PopBatch_magic_fail
add x15, x8, x0, LSL #2
ldrh w9, [x15] /* current */
ldrh w10, [x3] /* begin */
@@ -333,6 +340,8 @@ TcmallocSlab_Internal_PopBatch:
.LTcmallocSlab_Internal_PopBatch_commit:
mov x0, x11
ret
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PopBatch_no_items:
mov x0, #0
ret
diff --git a/tcmalloc/internal/percpu_rseq_x86_64.S b/tcmalloc/internal/percpu_rseq_x86_64.S
index 797fec5572f..f3724663055 100644
--- a/tcmalloc/internal/percpu_rseq_x86_64.S
+++ b/tcmalloc/internal/percpu_rseq_x86_64.S
@@ -98,6 +98,7 @@
.type label##_trampoline, @function; \
label##_trampoline: \
CFI(.cfi_startproc); \
+ CLEAR_SLABS_CACHE; \
jmp .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline;
@@ -137,6 +138,8 @@ label##_trampoline: \
movzwl %fs:__rseq_abi@TPOFF(offset), dest;
#define FETCH_SLABS(dest) \
movq %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET, dest
+#define CLEAR_SLABS_CACHE \
+ movq $0, %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET;
#define START_RSEQ(src) \
.L##src##_abort: \
leaq __rseq_cs_##src(%rip), %rax; \
@@ -154,6 +157,9 @@ label##_trampoline: \
*/
#define FETCH_CPU(dest, offset) movzwl (%rax, offset), dest;
#define FETCH_SLABS(dest) movq TCMALLOC_RSEQ_SLABS_OFFSET(%rax), dest
+#define CLEAR_SLABS_CACHE \
+ call tcmalloc_internal_tls_fetch_pic@PLT; \
+ movq $0, TCMALLOC_RSEQ_SLABS_OFFSET(%rax);
#define START_RSEQ(src) \
.L##src##_abort: \
call tcmalloc_internal_tls_fetch_pic@PLT; \
@@ -243,6 +249,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(%r8);
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PushBatch_full;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PushBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq 2(%r8, %rdi, 4), %r10; /* end */
cmpq %r10, %r9;
@@ -264,6 +272,8 @@ TcmallocSlab_Internal_PushBatch:
movq %rdx, %rax;
subq %r11, %rax;
ret;
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
.LTcmallocSlab_Internal_PushBatch_full:
xor %rax, %rax;
ret;
@@ -305,6 +315,8 @@ TcmallocSlab_Internal_PopBatch:
xorq %rax, %rax;
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PopBatch_commit;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PopBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq (%rcx), %r10; /* begin */
cmp %r10, %r9;
@@ -323,6 +335,10 @@ TcmallocSlab_Internal_PopBatch:
movw %r9w, (%r8, %rdi, 4);
.LTcmallocSlab_Internal_PopBatch_commit:
ret;
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
+ xor %rax, %rax; /* rax is clobbered by CLEAR_SLABS_CACHE on the PIC path. */
+ ret;
CFI(.cfi_endproc)
ENCODE_SIZE(TcmallocSlab_Internal_PopBatch)
DEFINE_UPSTREAM_CS(TcmallocSlab_Internal_PopBatch)
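Both assembly files gain the same two defenses. First, the abort trampoline zeroes the cached slabs pointer, so a thread restarted after migration (or a slab swap) re-derives the pointer instead of trusting a stale one. Second, the batch fast paths acquire-load header[0] (ldapr on aarch64; a plain testb on x86, where loads already have acquire semantics) and exit through a magic_fail label that clears the cache when the valid bit is unset. In rough C++ terms the new prologue of PushBatch/PopBatch behaves like this sketch (control flow only; the real code runs inside a restartable rseq critical section):

#include <atomic>
#include <cstdint>

constexpr uint64_t kCachedSlabsMask = 1ull << 63;  // TCMALLOC_CACHED_SLABS_BIT
constexpr int32_t kSlabValidMask = 0x1;            // TCMALLOC_SLAB_VALID_MASK

// Models the per-thread cached slab word at
// __rseq_abi + TCMALLOC_RSEQ_SLABS_OFFSET. Returns the slab base to operate
// on, or nullptr when the caller must return "0 items" / take the slow path.
std::atomic<int32_t>* BatchPrologue(uint64_t& tcmalloc_slabs) {
  uint64_t slabs = tcmalloc_slabs;                  // FETCH_SLABS
  if (!(slabs & kCachedSlabsMask)) return nullptr;  // no cached slab
  auto* header0 =
      reinterpret_cast<std::atomic<int32_t>*>(slabs & ~kCachedSlabsMask);
  if (!(header0->load(std::memory_order_acquire) & kSlabValidMask)) {
    tcmalloc_slabs = 0;  // magic_fail: drop the stale cached pointer
    return nullptr;
  }
  return header0;  // safe to index the per-size-class headers from here
}

The inline Push/Pop fast paths in percpu_tcmalloc.h below do the same check; on the non-asm-goto fallback paths they synthesize the expected flag state (clc/stc on x86, a self-comparison cmp on aarch64) before jumping to the shared exit label.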
diff --git a/tcmalloc/internal/percpu_tcmalloc.cc b/tcmalloc/internal/percpu_tcmalloc.cc
index a68e39d0a4f..3014b97b6c8 100644
--- a/tcmalloc/internal/percpu_tcmalloc.cc
+++ b/tcmalloc/internal/percpu_tcmalloc.cc
@@ -17,8 +17,8 @@
#include <algorithm>
#include <atomic>
#include <cstddef>
-#include <cstdint>
#include <limits>
+#include <cstdint>
#include <new>
#include <utility>
@@ -43,9 +43,6 @@ void TcmallocSlab::Init(
absl::FunctionRef<size_t(size_t)> capacity, Shift shift) {
TC_ASSERT(num_classes_ == 0 && num_classes != 0);
num_classes_ = num_classes;
- if (UsingFlatVirtualCpus()) {
- virtual_cpu_id_offset_ = offsetof(kernel_rseq, vcpu_id);
- }
stopped_ = new (alloc(sizeof(stopped_[0]) * NumCPUs(),
std::align_val_t{ABSL_CACHELINE_SIZE}))
std::atomic<bool>[NumCPUs()];
@@ -100,8 +97,11 @@ void TcmallocSlab::InitCpuImpl(void* slabs, Shift shift, int cpu,
TC_CHECK_LE((1 << ToUint8(shift)), (1 << 16) * sizeof(void*));
// Initialize prefetch target and compute the offsets for the
- // boundaries of each size class' cache.
+ // boundaries of each size class' cache. Keep the slab invalid until the
+ // caller explicitly publishes it with ClearStopFlag().
void* curr_slab = CpuMemoryStart(slabs, shift, cpu);
+ static_cast<std::atomic<int32_t>*>(curr_slab)->store(
+ 0, std::memory_order_relaxed);
void** elems = reinterpret_cast<void**>(
(reinterpret_cast<uintptr_t>(GetHeader(slabs, shift, cpu, num_classes_)) +
sizeof(void*) - 1) &
@@ -213,7 +213,9 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
absl::FunctionRef<bool(size_t)> populated,
DrainHandler drain_handler) -> ResizeSlabsInfo {
// Phase 1: Stop all CPUs and initialize any CPUs in the new slab that have
- // already been populated in the old slab.
+ // already been populated in the old slab. Keep the new slab invalid until
+ // phase 4 so a stale cached pointer cannot mistake a reused slab address for
+ // a published one during A -> B -> A reuse.
const auto [old_slabs, old_shift] =
GetSlabsAndShift(std::memory_order_relaxed);
TC_ASSERT_NE(new_shift, old_shift);
@@ -221,6 +223,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
+ SetStopFlag(cpu);
if (populated(cpu)) {
InitCpuImpl(new_slabs, new_shift, cpu, capacity);
}
@@ -238,6 +241,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
// Phase 4: Re-start all CPUs.
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
@@ -308,16 +312,34 @@ void TcmallocSlab::Drain(int cpu, DrainHandler drain_handler) {
DrainCpu(slabs, shift, cpu, drain_handler);
}
+void TcmallocSlab::SetStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ flag->store(0, std::memory_order_relaxed);
+}
+
+void TcmallocSlab::ClearStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ // Publish completed remote slab updates before allowing fast paths that
+ // still have a cached slab pointer to proceed based on header[0].
+ flag->store(TCMALLOC_SLAB_VALID_MASK, std::memory_order_release);
+}
+
void TcmallocSlab::StopCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
- FenceCpu(cpu, virtual_cpu_id_offset_);
+ SetStopFlag(cpu);
+ FenceCpu(cpu);
}
void TcmallocSlab::StartCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_ASSERT(stopped_[cpu].load(std::memory_order_relaxed));
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
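The ordering in StopCpu()/StartCpu() is what makes the flag sound: the valid bit must go down before the fence (so once FenceCpu returns, no fast path that saw the old state can still commit), and it must come back up, with release semantics, before stopped_ is lowered, because a fast path holding a cached slab pointer never consults stopped_ at all. Condensed into a sketch (free functions standing in for the members above):

#include <atomic>
#include <cstdint>

// `stopped` gates the C++ slow paths; `header0` (the valid bit) gates the
// rseq fast paths. Sketch only: FenceCpu and the remote work are elided.
void StopCpuSketch(std::atomic<bool>& stopped, std::atomic<int32_t>& header0) {
  stopped.store(true, std::memory_order_relaxed);
  header0.store(0, std::memory_order_relaxed);  // SetStopFlag
  // FenceCpu(cpu) here: after it returns, any rseq section that started
  // before the flag went down has been restarted, so nothing can commit
  // against the pre-stop slab state.
  // ... remote Drain/Grow/Shrink/Resize work is now safe ...
}

void StartCpuSketch(std::atomic<bool>& stopped, std::atomic<int32_t>& header0) {
  // Re-set the valid bit (release) BEFORE lowering `stopped`: a fast path
  // holding a cached slab pointer never reads `stopped`, so the flag store
  // must be the one that publishes the finished remote updates.
  header0.store(0x1, std::memory_order_release);  // ClearStopFlag
  stopped.store(false, std::memory_order_release);
}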
diff --git a/tcmalloc/internal/percpu_tcmalloc.h b/tcmalloc/internal/percpu_tcmalloc.h
index b5cd584a7bd..bbc74af712d 100644
--- a/tcmalloc/internal/percpu_tcmalloc.h
+++ b/tcmalloc/internal/percpu_tcmalloc.h
@@ -41,7 +41,9 @@
#include "tcmalloc/internal/prefetch.h"
#include "tcmalloc/internal/sysinfo.h"
-#if __clang_major__ >= 11
+// GCC supports asm goto, but at least gcc11 has a codegen bug on x86_64.
+#if (defined(__GNUC__) && !defined(__clang__) && !defined(__x86_64__)) || \
+ (defined(__clang__) && __clang_major__ >= 11)
#define TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT 1
#endif
@@ -227,6 +229,8 @@ class TcmallocSlab {
// synchronization protocol.
void StopCpu(int cpu);
void StartCpu(int cpu);
+ void SetStopFlag(int cpu);
+ void ClearStopFlag(int cpu);
// Grows the cpu/size_class slab's capacity to no greater than
// min(capacity+len, max_capacity(<shift>)) and returns the increment
@@ -355,7 +359,7 @@ class TcmallocSlab {
// so that we can atomically update both with a single store.
std::atomic<SlabsAndShift> slabs_and_shift_{};
// This is in units of bytes.
- size_t virtual_cpu_id_offset_ = offsetof(kernel_rseq, cpu_id);
+ static constexpr size_t virtual_cpu_id_offset_ = VirtualCpuIdFieldOffset();
// Remote Cpu operation (Resize/Drain/Grow/Shrink) is running so any local
// operations (Push/Pop) should fail.
std::atomic<bool>* stopped_ = nullptr;
@@ -399,6 +403,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#if defined(__x86_64__)
#define TCMALLOC_RSEQ_RELOC_TYPE "R_X86_64_NONE"
#define TCMALLOC_RSEQ_JUMP "jmp"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "movq $0, %[rseq_slabs_addr]\n"
#if !defined(__PIC__) && !defined(__PIE__)
#define TCMALLOC_RSEQ_SET_CS(name) \
"movq $__rseq_cs_" #name "_%=, %[rseq_cs_addr]\n"
@@ -430,6 +435,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#define TCMALLOC_RSEQ_CLOBBER "x16", "x17"
#define TCMALLOC_RSEQ_RELOC_TYPE "R_AARCH64_NONE"
#define TCMALLOC_RSEQ_JUMP "b"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "str xzr, %[rseq_slabs_addr]\n"
#define TCMALLOC_RSEQ_SET_CS(name) \
TCMALLOC_RSEQ_TRAMPLINE_SMASH \
"adrp %[scratch], __rseq_cs_" #name \
@@ -490,7 +496,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
"_trampoline_%=,@function\n" \
"" #name \
"_trampoline_%=:\n" \
- "2:\n" TCMALLOC_RSEQ_JUMP \
+ "2:\n" TCMALLOC_RSEQ_CLEAR_SLABS_CACHE TCMALLOC_RSEQ_JUMP \
" 3f\n" \
".size " #name "_trampoline_%=, . - " #name \
"_trampoline_%=\n" \
@@ -508,7 +514,9 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
is no cost to passing unused \
consts. */ \
[cached_slabs_bit] "n"(TCMALLOC_CACHED_SLABS_BIT), \
- [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK)
+ [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK), \
+ [slab_valid_bit] "n"(TCMALLOC_SLAB_VALID_BIT), \
+ [slab_valid_mask] "n"(TCMALLOC_SLAB_VALID_MASK)
// Store v to p (*p = v) if the current thread wasn't rescheduled
// (still has the slab pointer cached). Otherwise returns false.
@@ -566,6 +574,73 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool StoreCurrentCpu(volatile void* p,
return scratch;
}
+// Store new_v to p if the current thread still has the expected cached slab,
+// the current slab is still active, and *p still matches old_v. Otherwise
+// returns false.
+inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool CompareAndSwapCurrentCpuChecked(
+ std::atomic<int32_t>* p, int32_t old_v, int32_t new_v,
+ uintptr_t expected_slabs) {
+ uintptr_t scratch = 0;
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__x86_64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ xorq %[scratch], %[scratch]
+ movq %[rseq_slabs_addr], %[slab]
+ cmpq %[expected_slabs], %[slab]
+ jne 7f
+ btrq $%c[cached_slabs_bit], %[slab]
+ jnc 5f
+ testb $%c[slab_valid_mask], (%[slab])
+ jz 7f
+ movl %[p], %[observed]
+ cmpl %[old_v], %[observed]
+ jne 5f
+ movl $1, %k[scratch]
+ movl %[new_v], %[p]
+ jmp 5f
+ 7:
+ movq $0, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : "cc", "memory");
+#elif TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__aarch64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ mov %[scratch], #0
+ ldr %[slab], %[rseq_slabs_addr]
+ cmp %[slab], %[expected_slabs]
+ b.ne 7f
+ tbz %[slab], #%c[cached_slabs_bit], 5f
+ and %[slab], %[slab], #%c[cached_slabs_mask_neg]
+ ldapr %w[observed], [%[slab]]
+ tbz %w[observed], #%c[slab_valid_bit], 7f
+ ldr %w[observed], %[p]
+ cmp %w[observed], %w[old_v]
+ b.ne 5f
+ mov %[scratch], #1
+ str %w[new_v], %[p]
+ b 5f
+ 7:
+ str xzr, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : TCMALLOC_RSEQ_CLOBBER, "cc", "memory");
+#endif
+ return scratch;
+}
+
// Prefetch slabs memory for the case of repeated pushes/pops.
// Note: this prefetch slows down micro-benchmarks, but provides ~0.1-0.5%
// speedup for larger real applications.
@@ -591,8 +666,16 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[overflow_label]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
- "jae 5f\n" // ae==c
+ "jae 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "clc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = slabs->current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -618,7 +701,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_INPUTS, [size_class] "r"(size_class), [item] "r"(item)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -629,6 +712,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
// Current now points to the slot we are going to push to next.
PrefetchSlabMemory(scratch + current * sizeof(void*));
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -652,9 +737,17 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"tbz %[region_start], #%c[cached_slabs_bit], %l[overflow_label]\n"
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbz %w[scratch], #%c[slab_valid_bit], %l[magic_fail]\n"
#else
"subs %[region_start], %[region_start], %[cached_slabs_mask]\n"
"b.ls 5f\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbnz %w[scratch], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[scratch], %w[scratch]\n"
+ "b 5f\n"
+ "7:\n"
#endif
// end_ptr = &(slab_headers[0]->end)
"add %[end_ptr], %[region_start], #2\n"
@@ -689,7 +782,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -698,6 +791,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
}
#endif
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -762,9 +857,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[underflow_path]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
"cmc\n"
"jc 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "stc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = scratch->header[size_class].current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -797,7 +900,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
[size_class] "r"(size_class)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -814,6 +917,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
PrefetchSlabMemory(scratch + (current - 2) * sizeof(void*));
PrefetchNextObject(next);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -845,6 +950,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"b.eq 5f\n"
#endif
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbz %w[previous], #%c[slab_valid_bit], %l[magic_fail]\n"
+#else
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbnz %w[previous], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[previous], %w[previous]\n"
+ "b 5f\n"
+ "7:\n"
+#endif
// scratch = slab_headers[size_class]->current (current index)
"ldrh %w[scratch], [%[region_start], %[size_class_lsl2]]\n"
// scratch--
@@ -885,7 +1001,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -896,6 +1012,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
TSANAcquire(result);
PrefetchNextObject(prefetch);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -912,6 +1030,9 @@ inline size_t TcmallocSlab::Grow(
absl::FunctionRef<size_t(uint8_t)> max_capacity) {
const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
const size_t max_cap = max_capacity(ToUint8(shift));
+ const uintptr_t expected_slabs =
+ reinterpret_cast<uintptr_t>(CpuMemoryStart(slabs, shift, cpu)) |
+ TCMALLOC_CACHED_SLABS_MASK;
auto* hdrp = GetHeader(slabs, shift, cpu, size_class);
Header hdr = LoadHeader(hdrp);
uint16_t begin = begins_[size_class].load(std::memory_order_relaxed);
@@ -919,9 +1040,19 @@ inline size_t TcmallocSlab::Grow(
if (have <= 0) {
return 0;
}
+ // Grow preloads the header outside the critical section, so the commit must
+ // validate both the cached slab identity and that the header value we read is
+ // still current. Otherwise a stop/start cycle can restore the valid bit and
+ // let us overwrite a remote GrowOtherCache/ShrinkOtherCache update with stale
+ // data.
+ const auto old_hdr = hdr;
uint16_t n = std::min<uint16_t>(len, have);
hdr.end += n;
- return StoreCurrentCpu(hdrp, hdr) ? n : 0;
+ return CompareAndSwapCurrentCpuChecked(hdrp, absl::bit_cast<int32_t>(old_hdr),
+ absl::bit_cast<int32_t>(hdr),
+ expected_slabs)
+ ? n
+ : 0;
}
inline std::pair<int, bool> TcmallocSlab::CacheCpuSlab() {
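Grow() is the one fast path that reads state (the header) before entering the rseq section, so the valid bit alone cannot protect it: a complete stop/start cycle between the load and the commit restores the bit, and the slab address is unchanged, yet the preloaded header is stale; hence the compare-and-swap on the header value itself. A minimal model of the hazard, assuming (as the patch's bit_cast does) that Header packs {current, end} as two little-endian uint16_t halves of an int32_t:

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  // Header {current=8, end=16} packed into an int32_t (end in the high half).
  std::atomic<int32_t> hdr{0x00100008};
  int32_t old_hdr = hdr.load();          // T: preloaded outside the rseq CS
  hdr.store(0x000c0008);                 // R: stop, shrink end 16 -> 12, start
  int32_t stale = old_hdr + 0x00040000;  // T: end += 4, from the stale header
  // The old commit (StoreCurrentCpu) would store `stale` and silently undo
  // R's shrink; the new commit compare-exchanges on the value and fails
  // instead, so Grow() returns 0 and the caller retries later.
  assert(!hdr.compare_exchange_strong(old_hdr, stale));
  return 0;
}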
diff --git a/tcmalloc/testing/background_test.cc b/tcmalloc/testing/background_test.cc
index a6b6e008126..91a16ae1e43 100644
--- a/tcmalloc/testing/background_test.cc
+++ b/tcmalloc/testing/background_test.cc
@@ -18,7 +18,7 @@
#include <atomic>
#include <thread>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/malloc_extension.h"
@@ -30,13 +30,12 @@ namespace tcmalloc {
namespace {
TEST(BackgroundTest, Defaults) {
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
- EXPECT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
absl::Seconds(1));
}
TEST(BackgroundTest, Stress) {
- // Process background actions by setting a custom sleep interval.
struct ProcessActions {
static void Go() {
constexpr absl::Duration kSleepTime = absl::Milliseconds(10);
@@ -45,8 +44,7 @@ TEST(BackgroundTest, Stress) {
}
};
- // Make sure that background acions are indeed enabled.
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
std::thread background(ProcessActions::Go);
@@ -66,9 +64,4 @@ TEST(BackgroundTest, Stress) {
}
} // namespace
-} // namespace tcmalloc
-
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
+} // namespace tcmalloc
\ No newline at end of file
diff --git a/tcmalloc/testing/double_alloc_test.cc b/tcmalloc/testing/double_alloc_test.cc
new file mode 100644
index 00000000000..e00536dfbb2
--- /dev/null
+++ b/tcmalloc/testing/double_alloc_test.cc
@@ -0,0 +1,191 @@
+// Copyright 2025 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Stress test to detect double-allocation caused by the Linux 6.19 rseq bug.
+//
+// On Linux 6.19, membarrier RSEQ IPI no longer writes cpu_id_start.
+// This breaks tcmalloc's StopCpu protocol: ShrinkOtherCache/DrainCpu can
+// read slab objects concurrently with a Pop on the same CPU, giving two
+// callers the same pointer (silent heap corruption).
+//
+// Detection: each allocation is stamped with a per-thread canary. If another
+// thread receives the same pointer, it overwrites the canary. The original
+// owner detects this on its next verification pass.
+
+#include <sched.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdio>
+#include <thread>
+#include <vector>
+
+#include "mongo/unittest/unittest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/malloc_extension.h"
+
+namespace tcmalloc {
+namespace {
+
+constexpr int kNumThreads = 16;
+constexpr int kMaxLivePerThread = 800;
+constexpr absl::Duration kTestDuration = absl::Seconds(30);
+
+constexpr size_t kAllocSizes[] = {16, 32, 48, 64, 80, 128, 256};
+constexpr int kNumSizes = sizeof(kAllocSizes) / sizeof(kAllocSizes[0]);
+
+struct Alloc {
+ void* ptr;
+ size_t size;
+ uint64_t canary;
+};
+
+static uint64_t MakeCanary(int tid, uint64_t counter) {
+ return (static_cast<uint64_t>(tid + 1) << 48) | (counter & 0xFFFFFFFFFFFFULL);
+}
+
+static int CanaryTid(uint64_t canary) {
+ return static_cast<int>(canary >> 48) - 1;
+}
+
+static void StampAlloc(void* ptr, size_t size, uint64_t canary) {
+ auto* p = static_cast<volatile uint64_t*>(ptr);
+ size_t n = size / sizeof(uint64_t);
+ for (size_t i = 0; i < n; ++i) {
+ p[i] = canary;
+ }
+}
+
+static bool VerifyAlloc(const Alloc& a) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ return p[0] == a.canary;
+}
+
+TEST(DoubleAllocTest, DetectCorruption) {
+ MallocExtension::SetBackgroundProcessSleepInterval(absl::Milliseconds(1));
+
+ std::thread background([] {
+ MallocExtension::ProcessBackgroundActions();
+ });
+
+ std::atomic<bool> stop{false};
+ std::atomic<int> canary_corruptions{0};
+ std::atomic<uint64_t> total_allocs{0};
+
+ std::vector<std::thread> threads;
+ for (int tid = 0; tid < kNumThreads; ++tid) {
+ threads.emplace_back([&, tid] {
+ std::vector<Alloc> live;
+ live.reserve(kMaxLivePerThread + 128);
+ uint64_t counter = 0;
+ uint32_t rng = tid * 2654435761u + 1;
+
+ while (!stop.load(std::memory_order_relaxed)) {
+ rng = rng * 1103515245 + 12345;
+ size_t alloc_size = kAllocSizes[rng % kNumSizes];
+
+ for (int i = 0; i < 64 && static_cast<int>(live.size()) < kMaxLivePerThread; ++i) {
+ void* p = ::operator new(alloc_size);
+ uint64_t canary = MakeCanary(tid, ++counter);
+ StampAlloc(p, alloc_size, canary);
+ live.push_back({p, alloc_size, canary});
+ total_allocs.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ for (size_t i = 0; i < live.size(); ++i) {
+ if (!VerifyAlloc(live[i])) {
+ auto* p = static_cast<volatile uint64_t*>(live[i].ptr);
+ uint64_t found = p[0];
+ int found_tid = CanaryTid(found);
+ int expected_tid = CanaryTid(live[i].canary);
+ int corruptions =
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed) + 1;
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (#%d) ***\n"
+ " ptr=%p size=%zu\n"
+ " expected canary=0x%016lx (tid=%d)\n"
+ " found canary=0x%016lx (tid=%d)\n",
+ corruptions, live[i].ptr, live[i].size,
+ (unsigned long)live[i].canary, expected_tid,
+ (unsigned long)found, found_tid);
+ live[i].ptr = nullptr;
+ stop.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t w = 0;
+ for (size_t r = 0; r < live.size(); ++r) {
+ if (live[r].ptr != nullptr) {
+ if (w != r) live[w] = live[r];
+ ++w;
+ }
+ }
+ live.resize(w);
+
+ rng = rng * 1103515245 + 12345;
+ int to_free = live.size() / 2;
+ for (int i = 0; i < to_free; ++i) {
+ auto& a = live.back();
+ if (a.ptr) {
+ if (!VerifyAlloc(a)) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ uint64_t found = p[0];
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed);
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (at free) ***\n"
+ " ptr=%p expected=0x%016lx found=0x%016lx\n",
+ a.ptr, (unsigned long)a.canary, (unsigned long)found);
+ a.ptr = nullptr;
+ } else {
+ ::operator delete(a.ptr, a.size);
+ }
+ }
+ live.pop_back();
+ }
+ }
+
+ if (canary_corruptions.load(std::memory_order_relaxed) == 0) {
+ for (auto& a : live) {
+ if (a.ptr) ::operator delete(a.ptr, a.size);
+ }
+ }
+ });
+ }
+
+ absl::SleepFor(kTestDuration);
+ stop.store(true, std::memory_order_relaxed);
+
+ for (auto& t : threads) t.join();
+
+ MallocExtension::SetBackgroundProcessActionsEnabled(false);
+ background.join();
+
+ uint64_t ops = total_allocs.load();
+ int corruptions = canary_corruptions.load();
+ fprintf(stderr,
+ "\n=== Results ===\n"
+ "Total allocations: %lu\n"
+ "Canary corruptions (double allocations): %d\n",
+ (unsigned long)ops, corruptions);
+
+ if (corruptions > 0) {
+ FAIL("Double allocation detected") << ": " << corruptions << " corruptions out of " << ops << " allocations";
+ }
+}
+
+} // namespace
+} // namespace tcmalloc
\ No newline at end of file
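The test's canary encoding makes a double allocation self-identifying: the owning thread id lives in the top 16 bits (stored as tid + 1 so that zeroed memory never looks owned), so a foreign stamp names the thread that also received the pointer. For example, using MakeCanary/CanaryTid exactly as defined above:

#include <cassert>
#include <cstdint>

uint64_t MakeCanary(int tid, uint64_t counter) {
  return (static_cast<uint64_t>(tid + 1) << 48) | (counter & 0xFFFFFFFFFFFFULL);
}
int CanaryTid(uint64_t canary) {
  return static_cast<int>(canary >> 48) - 1;
}

int main() {
  uint64_t c = MakeCanary(/*tid=*/3, /*counter=*/0x2a);
  assert(c == 0x000400000000002aULL);  // tid+1 == 4 occupies bits 48..63
  assert(CanaryTid(c) == 3);           // recovers the owning thread
  return 0;
}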
diff --git a/tcmalloc/testing/test_allocator_harness.h b/tcmalloc/testing/test_allocator_harness.h
index e3333d9c7b7..d637b9e3419 100644
--- a/tcmalloc/testing/test_allocator_harness.h
+++ b/tcmalloc/testing/test_allocator_harness.h
@@ -20,7 +20,7 @@
#include <utility>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/base/optimization.h"
#include "absl/random/random.h"
#include "absl/synchronization/mutex.h"
diff --git a/tcmalloc/testing/testutil.h b/tcmalloc/testing/testutil.h
index 4d02ccde8b3..f9ef104e14c 100644
--- a/tcmalloc/testing/testutil.h
+++ b/tcmalloc/testing/testutil.h
@@ -239,7 +239,7 @@ class ScopedFakeCpuId {
// modifying __rseq_abi, we can inject our own CPU ID.
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id = cpu_id;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id = cpu_id;
}
#endif
@@ -252,7 +252,7 @@ class ScopedFakeCpuId {
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
}
diff --git a/tcmalloc/testing/thread_manager.h b/tcmalloc/testing/thread_manager.h
index 7f7acb3d1d4..0a462ce02ac 100644
--- a/tcmalloc/testing/thread_manager.h
+++ b/tcmalloc/testing/thread_manager.h
@@ -20,7 +20,7 @@
#include <thread>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/synchronization/blocking_counter.h"
namespace tcmalloc {
@@ -29,7 +29,7 @@ class ThreadManager {
public:
ThreadManager() : shutdown_(false) {}
~ThreadManager() {
- EXPECT_TRUE(shutdown_.load()) << "ThreadManager not stopped";
+ ASSERT_TRUE(shutdown_.load());
}
// Invokes `func` repeatedly on each of `n` threads until `Stop` is called.