@RedBeard0531
Created April 22, 2026 07:31
tcmalloc rseq fixes for Linux 6.19
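
The core of this patch is a per-CPU "slab valid" bit: bit 0 of the 32-bit header word for size class 0. Remote operations (StopCpu, ResizeSlabs) clear it before touching a CPU's slab and set it again with release ordering once the slab is consistent, so the rseq fast paths can notice a stale cached slab pointer even though the Linux 6.19 membarrier IPI no longer refreshes cpu_id_start. Below is a minimal C++ sketch of that publish/check protocol; the names StopSlab/PublishSlab/SlabUsable are illustrative only, and the real entry points in the diff are SetStopFlag, ClearStopFlag, and the TCMALLOC_SLAB_VALID_MASK tests done in assembly inside the rseq critical sections.

    #include <atomic>
    #include <cstdint>

    // Sketch of the valid-bit protocol (illustrative names, not the patch's API).
    // Bit 0 of the first header word marks the per-CPU slab as published.
    constexpr int32_t kSlabValidMask = 0x1;  // mirrors TCMALLOC_SLAB_VALID_MASK

    // Remote side: invalidate before mutating the victim CPU's slab.
    void StopSlab(std::atomic<int32_t>* header0) {
      header0->store(0, std::memory_order_relaxed);  // fast paths now bail out
      // ... fence the target CPU, then drain/resize/shrink its slab ...
    }

    // Remote side: publish once the slab is fully initialized again.
    void PublishSlab(std::atomic<int32_t>* header0) {
      // Release so the remote updates are visible before fast paths resume.
      header0->store(kSlabValidMask, std::memory_order_release);
    }

    // Fast-path side (done in assembly inside the rseq critical section in the
    // patch): if the bit is clear, drop the cached slab pointer and take the
    // slow path instead of operating on a torn or reused slab.
    bool SlabUsable(const std::atomic<int32_t>* header0) {
      return (header0->load(std::memory_order_acquire) & kSlabValidMask) != 0;
    }
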
diff --git a/tcmalloc/cpu_cache.h b/tcmalloc/cpu_cache.h
index 3ea24065168..a460ebb5227 100644
--- a/tcmalloc/cpu_cache.h
+++ b/tcmalloc/cpu_cache.h
@@ -726,12 +726,12 @@ static cpu_set_t FillActiveCpuMask() {
}
#ifdef PERCPU_USE_RSEQ
- const bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
+ constexpr bool real_cpus = !subtle::percpu::UsingFlatVirtualCpus();
#else
- const bool real_cpus = true;
+ constexpr bool real_cpus = true;
#endif
- if (real_cpus) {
+ if constexpr (real_cpus) {
return allowed_cpus;
}
diff --git a/tcmalloc/cpu_cache_test.cc b/tcmalloc/cpu_cache_test.cc
index e90e8e6b904..adc37e4ce76 100644
--- a/tcmalloc/cpu_cache_test.cc
+++ b/tcmalloc/cpu_cache_test.cc
@@ -398,9 +398,8 @@ TEST(CpuCacheTest, Metadata) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
const size_t num_to_move = cache.forwarder().num_objects_to_move(kSizeClass);
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
@@ -528,9 +527,8 @@ TEST(CpuCacheTest, CacheMissStats) {
int allowed_cpu_id;
const size_t kSizeClass = 2;
- const size_t virtual_cpu_id_offset = subtle::percpu::UsingFlatVirtualCpus()
- ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
+ constexpr size_t virtual_cpu_id_offset =
+ subtle::percpu::VirtualCpuIdFieldOffset();
void* ptr;
{
// Restrict this thread to a single core while allocating and processing the
diff --git a/tcmalloc/internal/percpu.cc b/tcmalloc/internal/percpu.cc
index 2973408bec0..4e0f408c5f5 100644
--- a/tcmalloc/internal/percpu.cc
+++ b/tcmalloc/internal/percpu.cc
@@ -87,10 +87,6 @@ static bool InitThreadPerCpu() {
return false;
}
-bool UsingFlatVirtualCpus() {
- return false;
-}
-
static void InitPerCpu() {
TC_CHECK(NumCPUs() <= std::numeric_limits<uint16_t>::max());
@@ -317,12 +313,14 @@ static void FenceInterruptCPU(int cpu) {
SlowFence(cpu);
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
+void FenceCpu(int cpu) {
// Prevent compiler re-ordering of code below. In particular, the call to
// GetCurrentCpu must not appear in assembly program order until after any
// code that comes before FenceCpu in C++ program order.
CompilerBarrier();
+ constexpr size_t virtual_cpu_id_offset = VirtualCpuIdFieldOffset();
+
// A useful fast path: nothing needs doing at all to order us with respect
// to our own CPU.
if (ABSL_PREDICT_TRUE(IsFastNoInit()) &&
@@ -330,7 +328,7 @@ void FenceCpu(int cpu, const size_t virtual_cpu_id_offset) {
return;
}
- if (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
+ if constexpr (virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id)) {
ASSUME(false);
// With virtual CPUs, we cannot identify the true physical core we need to
diff --git a/tcmalloc/internal/percpu.h b/tcmalloc/internal/percpu.h
index b592d9b6ea8..d9f83bb4dc7 100644
--- a/tcmalloc/internal/percpu.h
+++ b/tcmalloc/internal/percpu.h
@@ -25,6 +25,14 @@
// Offset from __rseq_abi to the cached slabs address.
#define TCMALLOC_RSEQ_SLABS_OFFSET -4
+// Bit 0 of header[0] (size class 0) marks a slab as valid/active.
+// After MADV_DONTNEED zeroes old slab pages, the bit stays clear,
+// causing the in-CS check to bail out. SetStopFlag and InitCpuImpl keep
+// a slab invalid with 0; ClearStopFlag publishes it by setting bit 0 once
+// the slab is fully initialized and visible to fast paths.
+#define TCMALLOC_SLAB_VALID_BIT 0
+#define TCMALLOC_SLAB_VALID_MASK 0x1
+
// The bit denotes that tcmalloc_rseq.slabs contains valid slabs offset.
#define TCMALLOC_CACHED_SLABS_BIT 63
#define TCMALLOC_CACHED_SLABS_MASK_SHIFT (1ul << TCMALLOC_CACHED_SLABS_BIT)
@@ -224,7 +232,17 @@ size_t TcmallocSlab_Internal_PopBatch(size_t size_class, void** batch,
// virtue of C linkage) in the supported case.
// Return whether we are using flat virtual CPUs.
-bool UsingFlatVirtualCpus();
+inline constexpr bool UsingFlatVirtualCpus() { return false; }
+
+// Byte offset from &__rseq_abi to the CPU id field used for slab indexing
+// (physical cpu_id vs flat vcpu_id), matching TcmallocSlab::virtual_cpu_id_offset_.
+inline constexpr size_t VirtualCpuIdFieldOffset() {
+ if constexpr (UsingFlatVirtualCpus()) {
+ return offsetof(kernel_rseq, vcpu_id);
+ } else {
+ return offsetof(kernel_rseq, cpu_id);
+ }
+}
enum class RseqVcpuMode { kNone };
inline RseqVcpuMode GetRseqVcpuMode() { return RseqVcpuMode::kNone; }
@@ -278,8 +296,10 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
return cpu;
}
- // Do not return a physical CPU ID when we expect a virtual CPU ID.
- TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ if constexpr (UsingFlatVirtualCpus()) {
+ // Do not return a physical CPU ID when we expect a virtual CPU ID.
+ TC_CHECK_NE(virtual_cpu_id_offset, offsetof(kernel_rseq, vcpu_id));
+ }
#ifdef TCMALLOC_HAVE_SCHED_GETCPU
cpu = sched_getcpu();
@@ -290,9 +310,7 @@ inline int GetCurrentVirtualCpu(const size_t virtual_cpu_id_offset) {
}
inline int GetCurrentVirtualCpuUnsafe() {
- const size_t offset = UsingFlatVirtualCpus() ? offsetof(kernel_rseq, vcpu_id)
- : offsetof(kernel_rseq, cpu_id);
- return GetCurrentVirtualCpuUnsafe(offset);
+ return GetCurrentVirtualCpuUnsafe(VirtualCpuIdFieldOffset());
}
bool InitFastPerCpu();
@@ -375,7 +393,7 @@ inline void TSANReleaseBatch(void** batch, int n) {
#endif
}
-void FenceCpu(int cpu, const size_t virtual_cpu_id_offset);
+void FenceCpu(int cpu);
void FenceAllCpus();
} // namespace percpu
diff --git a/tcmalloc/internal/percpu_rseq_aarch64.S b/tcmalloc/internal/percpu_rseq_aarch64.S
index d6a684c130f..82d4d31a83b 100644
--- a/tcmalloc/internal/percpu_rseq_aarch64.S
+++ b/tcmalloc/internal/percpu_rseq_aarch64.S
@@ -115,6 +115,7 @@
label##_trampoline: \
CFI(.cfi_startproc); \
BTI_C; \
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]; \
b .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline; \
@@ -231,6 +232,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PushBatch_no_capacity
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PushBatch_magic_fail
add x15, x8, x0, LSL #2 /* r15 = hdr */
ldrh w9, [x15] /* r9 = current */
ldrh w10, [x15, #2] /* r10 = end */
@@ -261,6 +264,8 @@ TcmallocSlab_Internal_PushBatch:
.LTcmallocSlab_Internal_PushBatch_commit:
mov x0, x10
ret
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PushBatch_no_capacity:
mov x0, #0
ret
@@ -303,6 +308,8 @@ TcmallocSlab_Internal_PopBatch:
FETCH_SLABS(x8)
tbz x8, #TCMALLOC_CACHED_SLABS_BIT, .LTcmallocSlab_Internal_PopBatch_no_items
and x8, x8, #~TCMALLOC_CACHED_SLABS_MASK
+ ldapr w15, [x8]
+ tbz w15, #TCMALLOC_SLAB_VALID_BIT, .LTcmallocSlab_Internal_PopBatch_magic_fail
add x15, x8, x0, LSL #2
ldrh w9, [x15] /* current */
ldrh w10, [x3] /* begin */
@@ -333,6 +340,8 @@ TcmallocSlab_Internal_PopBatch:
.LTcmallocSlab_Internal_PopBatch_commit:
mov x0, x11
ret
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ str xzr, [x5, TCMALLOC_RSEQ_SLABS_OFFSET]
.LTcmallocSlab_Internal_PopBatch_no_items:
mov x0, #0
ret
diff --git a/tcmalloc/internal/percpu_rseq_x86_64.S b/tcmalloc/internal/percpu_rseq_x86_64.S
index 797fec5572f..f3724663055 100644
--- a/tcmalloc/internal/percpu_rseq_x86_64.S
+++ b/tcmalloc/internal/percpu_rseq_x86_64.S
@@ -98,6 +98,7 @@
.type label##_trampoline, @function; \
label##_trampoline: \
CFI(.cfi_startproc); \
+ CLEAR_SLABS_CACHE; \
jmp .L##label##_abort; \
CFI(.cfi_endproc); \
.size label##_trampoline, . - label##_trampoline;
@@ -137,6 +138,8 @@ label##_trampoline: \
movzwl %fs:__rseq_abi@TPOFF(offset), dest;
#define FETCH_SLABS(dest) \
movq %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET, dest
+#define CLEAR_SLABS_CACHE \
+ movq $0, %fs:__rseq_abi@TPOFF + TCMALLOC_RSEQ_SLABS_OFFSET;
#define START_RSEQ(src) \
.L##src##_abort: \
leaq __rseq_cs_##src(%rip), %rax; \
@@ -154,6 +157,9 @@ label##_trampoline: \
*/
#define FETCH_CPU(dest, offset) movzwl (%rax, offset), dest;
#define FETCH_SLABS(dest) movq TCMALLOC_RSEQ_SLABS_OFFSET(%rax), dest
+#define CLEAR_SLABS_CACHE \
+ call tcmalloc_internal_tls_fetch_pic@PLT; \
+ movq $0, TCMALLOC_RSEQ_SLABS_OFFSET(%rax);
#define START_RSEQ(src) \
.L##src##_abort: \
call tcmalloc_internal_tls_fetch_pic@PLT; \
@@ -243,6 +249,8 @@ TcmallocSlab_Internal_PushBatch:
FETCH_SLABS(%r8);
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PushBatch_full;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PushBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq 2(%r8, %rdi, 4), %r10; /* end */
cmpq %r10, %r9;
@@ -264,6 +272,8 @@ TcmallocSlab_Internal_PushBatch:
movq %rdx, %rax;
subq %r11, %rax;
ret;
+.LTcmallocSlab_Internal_PushBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
.LTcmallocSlab_Internal_PushBatch_full:
xor %rax, %rax;
ret;
@@ -305,6 +315,8 @@ TcmallocSlab_Internal_PopBatch:
xorq %rax, %rax;
btrq $TCMALLOC_CACHED_SLABS_BIT, %r8;
jnc .LTcmallocSlab_Internal_PopBatch_commit;
+ testb $TCMALLOC_SLAB_VALID_MASK, (%r8);
+ jz .LTcmallocSlab_Internal_PopBatch_magic_fail;
movzwq (%r8, %rdi, 4), %r9; /* current */
movzwq (%rcx), %r10; /* begin */
cmp %r10, %r9;
@@ -323,6 +335,10 @@ TcmallocSlab_Internal_PopBatch:
movw %r9w, (%r8, %rdi, 4);
.LTcmallocSlab_Internal_PopBatch_commit:
ret;
+.LTcmallocSlab_Internal_PopBatch_magic_fail:
+ CLEAR_SLABS_CACHE;
+ xor %rax, %rax; /* rax is clobbered by CLEAR_SLABS_CACHE on the PIC path. */
+ ret;
CFI(.cfi_endproc)
ENCODE_SIZE(TcmallocSlab_Internal_PopBatch)
DEFINE_UPSTREAM_CS(TcmallocSlab_Internal_PopBatch)
diff --git a/tcmalloc/internal/percpu_tcmalloc.cc b/tcmalloc/internal/percpu_tcmalloc.cc
index a68e39d0a4f..3014b97b6c8 100644
--- a/tcmalloc/internal/percpu_tcmalloc.cc
+++ b/tcmalloc/internal/percpu_tcmalloc.cc
@@ -17,8 +17,8 @@
#include <algorithm>
#include <atomic>
#include <cstddef>
-#include <cstdint>
#include <limits>
+#include <cstdint>
#include <new>
#include <utility>
@@ -43,9 +43,6 @@ void TcmallocSlab::Init(
absl::FunctionRef<size_t(size_t)> capacity, Shift shift) {
TC_ASSERT(num_classes_ == 0 && num_classes != 0);
num_classes_ = num_classes;
- if (UsingFlatVirtualCpus()) {
- virtual_cpu_id_offset_ = offsetof(kernel_rseq, vcpu_id);
- }
stopped_ = new (alloc(sizeof(stopped_[0]) * NumCPUs(),
std::align_val_t{ABSL_CACHELINE_SIZE}))
std::atomic<bool>[NumCPUs()];
@@ -100,8 +97,11 @@ void TcmallocSlab::InitCpuImpl(void* slabs, Shift shift, int cpu,
TC_CHECK_LE((1 << ToUint8(shift)), (1 << 16) * sizeof(void*));
// Initialize prefetch target and compute the offsets for the
- // boundaries of each size class' cache.
+ // boundaries of each size class' cache. Keep the slab invalid until the
+ // caller explicitly publishes it with ClearStopFlag().
void* curr_slab = CpuMemoryStart(slabs, shift, cpu);
+ static_cast<std::atomic<int32_t>*>(curr_slab)->store(
+ 0, std::memory_order_relaxed);
void** elems = reinterpret_cast<void**>(
(reinterpret_cast<uintptr_t>(GetHeader(slabs, shift, cpu, num_classes_)) +
sizeof(void*) - 1) &
@@ -213,7 +213,9 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
absl::FunctionRef<bool(size_t)> populated,
DrainHandler drain_handler) -> ResizeSlabsInfo {
// Phase 1: Stop all CPUs and initialize any CPUs in the new slab that have
- // already been populated in the old slab.
+ // already been populated in the old slab. Keep the new slab invalid until
+ // phase 4 so a stale cached pointer cannot mistake a reused slab address for
+ // a published one during A -> B -> A reuse.
const auto [old_slabs, old_shift] =
GetSlabsAndShift(std::memory_order_relaxed);
TC_ASSERT_NE(new_shift, old_shift);
@@ -221,6 +223,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
+ SetStopFlag(cpu);
if (populated(cpu)) {
InitCpuImpl(new_slabs, new_shift, cpu, capacity);
}
@@ -238,6 +241,7 @@ auto TcmallocSlab::ResizeSlabs(Shift new_shift, void* new_slabs,
// Phase 4: Re-start all CPUs.
for (size_t cpu = 0; cpu < num_cpus; ++cpu) {
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
@@ -308,16 +312,34 @@ void TcmallocSlab::Drain(int cpu, DrainHandler drain_handler) {
DrainCpu(slabs, shift, cpu, drain_handler);
}
+void TcmallocSlab::SetStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ flag->store(0, std::memory_order_relaxed);
+}
+
+void TcmallocSlab::ClearStopFlag(int cpu) {
+ const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
+ auto* flag = static_cast<std::atomic<int32_t>*>(
+ CpuMemoryStart(slabs, shift, cpu));
+ // Publish completed remote slab updates before allowing fast paths that
+ // still have a cached slab pointer to proceed based on header[0].
+ flag->store(TCMALLOC_SLAB_VALID_MASK, std::memory_order_release);
+}
+
void TcmallocSlab::StopCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_CHECK(!stopped_[cpu].load(std::memory_order_relaxed));
stopped_[cpu].store(true, std::memory_order_relaxed);
- FenceCpu(cpu, virtual_cpu_id_offset_);
+ SetStopFlag(cpu);
+ FenceCpu(cpu);
}
void TcmallocSlab::StartCpu(int cpu) {
TC_ASSERT(cpu >= 0 && cpu < NumCPUs(), "cpu=%d", cpu);
TC_ASSERT(stopped_[cpu].load(std::memory_order_relaxed));
+ ClearStopFlag(cpu);
stopped_[cpu].store(false, std::memory_order_release);
}
diff --git a/tcmalloc/internal/percpu_tcmalloc.h b/tcmalloc/internal/percpu_tcmalloc.h
index b5cd584a7bd..bbc74af712d 100644
--- a/tcmalloc/internal/percpu_tcmalloc.h
+++ b/tcmalloc/internal/percpu_tcmalloc.h
@@ -41,7 +41,9 @@
#include "tcmalloc/internal/prefetch.h"
#include "tcmalloc/internal/sysinfo.h"
-#if __clang_major__ >= 11
+// GCC supports asm goto, but at least gcc11 has a codegen bug on x86_64.
+#if (defined(__GNUC__) && !defined(__clang__) && !defined(__x86_64__)) || \
+ (defined(__clang__) && __clang_major__ >= 11)
#define TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT 1
#endif
@@ -227,6 +229,8 @@ class TcmallocSlab {
// synchronization protocol.
void StopCpu(int cpu);
void StartCpu(int cpu);
+ void SetStopFlag(int cpu);
+ void ClearStopFlag(int cpu);
// Grows the cpu/size_class slab's capacity to no greater than
// min(capacity+len, max_capacity(<shift>)) and returns the increment
@@ -355,7 +359,7 @@ class TcmallocSlab {
// so that we can atomically update both with a single store.
std::atomic<SlabsAndShift> slabs_and_shift_{};
// This is in units of bytes.
- size_t virtual_cpu_id_offset_ = offsetof(kernel_rseq, cpu_id);
+ static constexpr size_t virtual_cpu_id_offset_ = VirtualCpuIdFieldOffset();
// Remote Cpu operation (Resize/Drain/Grow/Shrink) is running so any local
// operations (Push/Pop) should fail.
std::atomic<bool>* stopped_ = nullptr;
@@ -399,6 +403,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#if defined(__x86_64__)
#define TCMALLOC_RSEQ_RELOC_TYPE "R_X86_64_NONE"
#define TCMALLOC_RSEQ_JUMP "jmp"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "movq $0, %[rseq_slabs_addr]\n"
#if !defined(__PIC__) && !defined(__PIE__)
#define TCMALLOC_RSEQ_SET_CS(name) \
"movq $__rseq_cs_" #name "_%=, %[rseq_cs_addr]\n"
@@ -430,6 +435,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
#define TCMALLOC_RSEQ_CLOBBER "x16", "x17"
#define TCMALLOC_RSEQ_RELOC_TYPE "R_AARCH64_NONE"
#define TCMALLOC_RSEQ_JUMP "b"
+#define TCMALLOC_RSEQ_CLEAR_SLABS_CACHE "str xzr, %[rseq_slabs_addr]\n"
#define TCMALLOC_RSEQ_SET_CS(name) \
TCMALLOC_RSEQ_TRAMPLINE_SMASH \
"adrp %[scratch], __rseq_cs_" #name \
@@ -490,7 +496,7 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
"_trampoline_%=,@function\n" \
"" #name \
"_trampoline_%=:\n" \
- "2:\n" TCMALLOC_RSEQ_JUMP \
+ "2:\n" TCMALLOC_RSEQ_CLEAR_SLABS_CACHE TCMALLOC_RSEQ_JUMP \
" 3f\n" \
".size " #name "_trampoline_%=, . - " #name \
"_trampoline_%=\n" \
@@ -508,7 +514,9 @@ inline size_t TcmallocSlab::Capacity(int cpu, size_t size_class) const {
is no cost to passing unused \
consts. */ \
[cached_slabs_bit] "n"(TCMALLOC_CACHED_SLABS_BIT), \
- [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK)
+ [cached_slabs_mask_neg] "n"(~TCMALLOC_CACHED_SLABS_MASK), \
+ [slab_valid_bit] "n"(TCMALLOC_SLAB_VALID_BIT), \
+ [slab_valid_mask] "n"(TCMALLOC_SLAB_VALID_MASK)
// Store v to p (*p = v) if the current thread wasn't rescheduled
// (still has the slab pointer cached). Otherwise returns false.
@@ -566,6 +574,73 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool StoreCurrentCpu(volatile void* p,
return scratch;
}
+// Store new_v to p if the current thread still has the expected cached slab,
+// the current slab is still active, and *p still matches old_v. Otherwise
+// returns false.
+inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool CompareAndSwapCurrentCpuChecked(
+ std::atomic<int32_t>* p, int32_t old_v, int32_t new_v,
+ uintptr_t expected_slabs) {
+ uintptr_t scratch = 0;
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__x86_64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ xorq %[scratch], %[scratch]
+ movq %[rseq_slabs_addr], %[slab]
+ cmpq %[expected_slabs], %[slab]
+ jne 7f
+ btrq $%c[cached_slabs_bit], %[slab]
+ jnc 5f
+ testb $%c[slab_valid_mask], (%[slab])
+ jz 7f
+ movl %[p], %[observed]
+ cmpl %[old_v], %[observed]
+ jne 5f
+ movl $1, %k[scratch]
+ movl %[new_v], %[p]
+ jmp 5f
+ 7:
+ movq $0, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : "cc", "memory");
+#elif TCMALLOC_INTERNAL_PERCPU_USE_RSEQ && defined(__aarch64__)
+ uintptr_t slab;
+ uint32_t observed;
+ asm(TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_StoreCurrentCpuChecked)
+ R"(
+ mov %[scratch], #0
+ ldr %[slab], %[rseq_slabs_addr]
+ cmp %[slab], %[expected_slabs]
+ b.ne 7f
+ tbz %[slab], #%c[cached_slabs_bit], 5f
+ and %[slab], %[slab], #%c[cached_slabs_mask_neg]
+ ldapr %w[observed], [%[slab]]
+ tbz %w[observed], #%c[slab_valid_bit], 7f
+ ldr %w[observed], %[p]
+ cmp %w[observed], %w[old_v]
+ b.ne 5f
+ mov %[scratch], #1
+ str %w[new_v], %[p]
+ b 5f
+ 7:
+ str xzr, %[rseq_slabs_addr]
+ 5 :)"
+ : [scratch] "=&r"(scratch), [slab] "=&r"(slab),
+ [observed] "=&r"(observed)
+ : TCMALLOC_RSEQ_INPUTS, [expected_slabs] "r"(expected_slabs),
+ [p] "m"(*reinterpret_cast<volatile int32_t*>(p)), [old_v] "r"(old_v),
+ [new_v] "r"(new_v)
+ : TCMALLOC_RSEQ_CLOBBER, "cc", "memory");
+#endif
+ return scratch;
+}
+
// Prefetch slabs memory for the case of repeated pushes/pops.
// Note: this prefetch slows down micro-benchmarks, but provides ~0.1-0.5%
// speedup for larger real applications.
@@ -591,8 +666,16 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[overflow_label]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
- "jae 5f\n" // ae==c
+ "jae 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "clc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = slabs->current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -618,7 +701,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_INPUTS, [size_class] "r"(size_class), [item] "r"(item)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -629,6 +712,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
// Current now points to the slot we are going to push to next.
PrefetchSlabMemory(scratch + current * sizeof(void*));
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -652,9 +737,17 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"tbz %[region_start], #%c[cached_slabs_bit], %l[overflow_label]\n"
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbz %w[scratch], #%c[slab_valid_bit], %l[magic_fail]\n"
#else
"subs %[region_start], %[region_start], %[cached_slabs_mask]\n"
"b.ls 5f\n"
+ "ldapr %w[scratch], [%[region_start]]\n"
+ "tbnz %w[scratch], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[scratch], %w[scratch]\n"
+ "b 5f\n"
+ "7:\n"
#endif
// end_ptr = &(slab_headers[0]->end)
"add %[end_ptr], %[region_start], #2\n"
@@ -689,7 +782,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : overflow_label
+ : overflow_label, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -698,6 +791,8 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
}
#endif
return true;
+magic_fail:
+ tcmalloc_slabs = 0;
overflow_label:
return false;
}
@@ -762,9 +857,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"btrq $%c[cached_slabs_bit], %[scratch]\n"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
"jnc %l[underflow_path]\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jz %l[magic_fail]\n"
#else
"cmc\n"
"jc 5f\n"
+ "testb $%c[slab_valid_mask], (%[scratch])\n"
+ "jnz 7f\n"
+ "movq $0, %[rseq_slabs_addr]\n"
+ "stc\n"
+ "jmp 5f\n"
+ "7:\n"
#endif
// current = scratch->header[size_class].current;
"movzwq (%[scratch], %[size_class], 4), %[current]\n"
@@ -797,7 +900,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
[size_class] "r"(size_class)
: "cc", "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -814,6 +917,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
PrefetchSlabMemory(scratch + (current - 2) * sizeof(void*));
PrefetchNextObject(next);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -845,6 +950,17 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
"b.eq 5f\n"
#endif
"and %[region_start], %[region_start], #%c[cached_slabs_mask_neg]\n"
+#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbz %w[previous], #%c[slab_valid_bit], %l[magic_fail]\n"
+#else
+ "ldapr %w[previous], [%[region_start]]\n"
+ "tbnz %w[previous], #%c[slab_valid_bit], 7f\n"
+ "str xzr, %[rseq_slabs_addr]\n"
+ "cmp %w[previous], %w[previous]\n"
+ "b 5f\n"
+ "7:\n"
+#endif
// scratch = slab_headers[size_class]->current (current index)
"ldrh %w[scratch], [%[region_start], %[size_class_lsl2]]\n"
// scratch--
@@ -885,7 +1001,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
: TCMALLOC_RSEQ_CLOBBER, "memory"
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
, "cc"
- : underflow_path
+ : underflow_path, magic_fail
#endif
);
#if !TCMALLOC_INTERNAL_PERCPU_USE_RSEQ_ASM_GOTO_OUTPUT
@@ -896,6 +1012,8 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* TcmallocSlab::Pop(size_t size_class) {
TSANAcquire(result);
PrefetchNextObject(prefetch);
return AssumeNotNull(result);
+magic_fail:
+ tcmalloc_slabs = 0;
underflow_path:
return nullptr;
}
@@ -912,6 +1030,9 @@ inline size_t TcmallocSlab::Grow(
absl::FunctionRef<size_t(uint8_t)> max_capacity) {
const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
const size_t max_cap = max_capacity(ToUint8(shift));
+ const uintptr_t expected_slabs =
+ reinterpret_cast<uintptr_t>(CpuMemoryStart(slabs, shift, cpu)) |
+ TCMALLOC_CACHED_SLABS_MASK;
auto* hdrp = GetHeader(slabs, shift, cpu, size_class);
Header hdr = LoadHeader(hdrp);
uint16_t begin = begins_[size_class].load(std::memory_order_relaxed);
@@ -919,9 +1040,19 @@ inline size_t TcmallocSlab::Grow(
if (have <= 0) {
return 0;
}
+ // Grow preloads the header outside the critical section, so the commit must
+ // validate both the cached slab identity and that the header value we read is
+ // still current. Otherwise a stop/start cycle can restore the valid bit and
+ // let us overwrite a remote GrowOtherCache/ShrinkOtherCache update with stale
+ // data.
+ const auto old_hdr = hdr;
uint16_t n = std::min<uint16_t>(len, have);
hdr.end += n;
- return StoreCurrentCpu(hdrp, hdr) ? n : 0;
+ return CompareAndSwapCurrentCpuChecked(hdrp, absl::bit_cast<int32_t>(old_hdr),
+ absl::bit_cast<int32_t>(hdr),
+ expected_slabs)
+ ? n
+ : 0;
}
inline std::pair<int, bool> TcmallocSlab::CacheCpuSlab() {
diff --git a/tcmalloc/testing/background_test.cc b/tcmalloc/testing/background_test.cc
index a6b6e008126..91a16ae1e43 100644
--- a/tcmalloc/testing/background_test.cc
+++ b/tcmalloc/testing/background_test.cc
@@ -18,7 +18,7 @@
#include <atomic>
#include <thread>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/malloc_extension.h"
@@ -30,13 +30,12 @@ namespace tcmalloc {
namespace {
TEST(BackgroundTest, Defaults) {
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
- EXPECT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_EQ(MallocExtension::GetBackgroundProcessSleepInterval(),
absl::Seconds(1));
}
TEST(BackgroundTest, Stress) {
- // Process background actions by setting a custom sleep interval.
struct ProcessActions {
static void Go() {
constexpr absl::Duration kSleepTime = absl::Milliseconds(10);
@@ -45,8 +44,7 @@ TEST(BackgroundTest, Stress) {
}
};
- // Make sure that background acions are indeed enabled.
- EXPECT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
+ ASSERT_TRUE(MallocExtension::GetBackgroundProcessActionsEnabled());
std::thread background(ProcessActions::Go);
@@ -66,9 +64,4 @@ TEST(BackgroundTest, Stress) {
}
} // namespace
-} // namespace tcmalloc
-
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
+} // namespace tcmalloc
\ No newline at end of file
diff --git a/tcmalloc/testing/double_alloc_test.cc b/tcmalloc/testing/double_alloc_test.cc
new file mode 100644
index 00000000000..e00536dfbb2
--- /dev/null
+++ b/tcmalloc/testing/double_alloc_test.cc
@@ -0,0 +1,191 @@
+// Copyright 2025 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Stress test to detect double-allocation caused by the Linux 6.19 rseq bug.
+//
+// On Linux 6.19, membarrier RSEQ IPI no longer writes cpu_id_start.
+// This breaks tcmalloc's StopCpu protocol: ShrinkOtherCache/DrainCpu can
+// read slab objects concurrently with a Pop on the same CPU, giving two
+// callers the same pointer (silent heap corruption).
+//
+// Detection: each allocation is stamped with a per-thread canary. If another
+// thread receives the same pointer, it overwrites the canary. The original
+// owner detects this on its next verification pass.
+
+#include <sched.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdio>
+#include <thread>
+#include <vector>
+
+#include "mongo/unittest/unittest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/malloc_extension.h"
+
+namespace tcmalloc {
+namespace {
+
+constexpr int kNumThreads = 16;
+constexpr int kMaxLivePerThread = 800;
+constexpr absl::Duration kTestDuration = absl::Seconds(30);
+
+constexpr size_t kAllocSizes[] = {16, 32, 48, 64, 80, 128, 256};
+constexpr int kNumSizes = sizeof(kAllocSizes) / sizeof(kAllocSizes[0]);
+
+struct Alloc {
+ void* ptr;
+ size_t size;
+ uint64_t canary;
+};
+
+static uint64_t MakeCanary(int tid, uint64_t counter) {
+ return (static_cast<uint64_t>(tid + 1) << 48) | (counter & 0xFFFFFFFFFFFFULL);
+}
+
+static int CanaryTid(uint64_t canary) {
+ return static_cast<int>(canary >> 48) - 1;
+}
+
+static void StampAlloc(void* ptr, size_t size, uint64_t canary) {
+ auto* p = static_cast<volatile uint64_t*>(ptr);
+ size_t n = size / sizeof(uint64_t);
+ for (size_t i = 0; i < n; ++i) {
+ p[i] = canary;
+ }
+}
+
+static bool VerifyAlloc(const Alloc& a) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ return p[0] == a.canary;
+}
+
+TEST(DoubleAllocTest, DetectCorruption) {
+ MallocExtension::SetBackgroundProcessSleepInterval(absl::Milliseconds(1));
+
+ std::thread background([] {
+ MallocExtension::ProcessBackgroundActions();
+ });
+
+ std::atomic<bool> stop{false};
+ std::atomic<int> canary_corruptions{0};
+ std::atomic<uint64_t> total_allocs{0};
+
+ std::vector<std::thread> threads;
+ for (int tid = 0; tid < kNumThreads; ++tid) {
+ threads.emplace_back([&, tid] {
+ std::vector<Alloc> live;
+ live.reserve(kMaxLivePerThread + 128);
+ uint64_t counter = 0;
+ uint32_t rng = tid * 2654435761u + 1;
+
+ while (!stop.load(std::memory_order_relaxed)) {
+ rng = rng * 1103515245 + 12345;
+ size_t alloc_size = kAllocSizes[rng % kNumSizes];
+
+ for (int i = 0; i < 64 && static_cast<int>(live.size()) < kMaxLivePerThread; ++i) {
+ void* p = ::operator new(alloc_size);
+ uint64_t canary = MakeCanary(tid, ++counter);
+ StampAlloc(p, alloc_size, canary);
+ live.push_back({p, alloc_size, canary});
+ total_allocs.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ for (size_t i = 0; i < live.size(); ++i) {
+ if (!VerifyAlloc(live[i])) {
+ auto* p = static_cast<volatile uint64_t*>(live[i].ptr);
+ uint64_t found = p[0];
+ int found_tid = CanaryTid(found);
+ int expected_tid = CanaryTid(live[i].canary);
+ int corruptions =
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed) + 1;
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (#%d) ***\n"
+ " ptr=%p size=%zu\n"
+ " expected canary=0x%016lx (tid=%d)\n"
+ " found canary=0x%016lx (tid=%d)\n",
+ corruptions, live[i].ptr, live[i].size,
+ (unsigned long)live[i].canary, expected_tid,
+ (unsigned long)found, found_tid);
+ live[i].ptr = nullptr;
+ stop.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t w = 0;
+ for (size_t r = 0; r < live.size(); ++r) {
+ if (live[r].ptr != nullptr) {
+ if (w != r) live[w] = live[r];
+ ++w;
+ }
+ }
+ live.resize(w);
+
+ rng = rng * 1103515245 + 12345;
+ int to_free = live.size() / 2;
+ for (int i = 0; i < to_free; ++i) {
+ auto& a = live.back();
+ if (a.ptr) {
+ if (!VerifyAlloc(a)) {
+ auto* p = static_cast<volatile uint64_t*>(a.ptr);
+ uint64_t found = p[0];
+ canary_corruptions.fetch_add(1, std::memory_order_relaxed);
+ fprintf(stderr,
+ "*** DOUBLE ALLOCATION DETECTED (at free) ***\n"
+ " ptr=%p expected=0x%016lx found=0x%016lx\n",
+ a.ptr, (unsigned long)a.canary, (unsigned long)found);
+ a.ptr = nullptr;
+ } else {
+ ::operator delete(a.ptr, a.size);
+ }
+ }
+ live.pop_back();
+ }
+ }
+
+ if (canary_corruptions.load(std::memory_order_relaxed) == 0) {
+ for (auto& a : live) {
+ if (a.ptr) ::operator delete(a.ptr, a.size);
+ }
+ }
+ });
+ }
+
+ absl::SleepFor(kTestDuration);
+ stop.store(true, std::memory_order_relaxed);
+
+ for (auto& t : threads) t.join();
+
+ MallocExtension::SetBackgroundProcessActionsEnabled(false);
+ background.join();
+
+ uint64_t ops = total_allocs.load();
+ int corruptions = canary_corruptions.load();
+ fprintf(stderr,
+ "\n=== Results ===\n"
+ "Total allocations: %lu\n"
+ "Canary corruptions (double allocations): %d\n",
+ (unsigned long)ops, corruptions);
+
+ if (corruptions > 0) {
+ FAIL("Double allocation detected") << ": " << corruptions << " corruptions out of " << ops << " allocations";
+ }
+}
+
+} // namespace
+} // namespace tcmalloc
\ No newline at end of file
diff --git a/tcmalloc/testing/test_allocator_harness.h b/tcmalloc/testing/test_allocator_harness.h
index e3333d9c7b7..d637b9e3419 100644
--- a/tcmalloc/testing/test_allocator_harness.h
+++ b/tcmalloc/testing/test_allocator_harness.h
@@ -20,7 +20,7 @@
#include <utility>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/base/optimization.h"
#include "absl/random/random.h"
#include "absl/synchronization/mutex.h"
diff --git a/tcmalloc/testing/testutil.h b/tcmalloc/testing/testutil.h
index 4d02ccde8b3..f9ef104e14c 100644
--- a/tcmalloc/testing/testutil.h
+++ b/tcmalloc/testing/testutil.h
@@ -239,7 +239,7 @@ class ScopedFakeCpuId {
// modifying __rseq_abi, we can inject our own CPU ID.
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id = cpu_id;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id = cpu_id;
}
#endif
@@ -252,7 +252,7 @@ class ScopedFakeCpuId {
tcmalloc_internal::subtle::percpu::__rseq_abi.cpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
- if (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
+ if constexpr (tcmalloc_internal::subtle::percpu::UsingFlatVirtualCpus()) {
tcmalloc_internal::subtle::percpu::__rseq_abi.vcpu_id =
tcmalloc_internal::subtle::percpu::kCpuIdUninitialized;
}
diff --git a/tcmalloc/testing/thread_manager.h b/tcmalloc/testing/thread_manager.h
index 7f7acb3d1d4..0a462ce02ac 100644
--- a/tcmalloc/testing/thread_manager.h
+++ b/tcmalloc/testing/thread_manager.h
@@ -20,7 +20,7 @@
#include <thread>
#include <vector>
-#include "gtest/gtest.h"
+#include "mongo/unittest/unittest.h"
#include "absl/synchronization/blocking_counter.h"
namespace tcmalloc {
@@ -29,7 +29,7 @@ class ThreadManager {
public:
ThreadManager() : shutdown_(false) {}
~ThreadManager() {
- EXPECT_TRUE(shutdown_.load()) << "ThreadManager not stopped";
+ ASSERT_TRUE(shutdown_.load());
}
// Invokes `func` repeatedly on each of `n` threads until `Stop` is called.