| // Copyright 2020 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "partition_alloc/thread_cache.h" |
| |
| #include <sys/types.h> |
| |
| #include <algorithm> |
| #include <atomic> |
| #include <cstdint> |
| |
| #include "build/build_config.h" |
| #include "partition_alloc/partition_alloc-inl.h" |
| #include "partition_alloc/partition_alloc_base/component_export.h" |
| #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h" |
| #include "partition_alloc/partition_alloc_base/immediate_crash.h" |
| #include "partition_alloc/partition_alloc_base/time/time.h" |
| #include "partition_alloc/partition_alloc_buildflags.h" |
| #include "partition_alloc/partition_alloc_check.h" |
| #include "partition_alloc/partition_alloc_config.h" |
| #include "partition_alloc/partition_alloc_constants.h" |
| #include "partition_alloc/partition_root.h" |
| |
| namespace partition_alloc { |
| |
| namespace { |
| ThreadCacheRegistry g_instance; |
| } // namespace |
| |
| namespace tools { |
| uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize] = { |
| kNeedle1, reinterpret_cast<uintptr_t>(&g_instance), |
| #if BUILDFLAG(RECORD_ALLOC_INFO) |
| reinterpret_cast<uintptr_t>(&internal::g_allocs), |
| #else |
| 0, |
| #endif |
| kNeedle2}; |
| } // namespace tools |
| |
| namespace internal { |
| |
| PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key; |
| #if PA_CONFIG(THREAD_CACHE_FAST_TLS) |
| PA_COMPONENT_EXPORT(PARTITION_ALLOC) |
| thread_local ThreadCache* g_thread_cache; |
| #endif |
| |
| } // namespace internal |
| |
| namespace { |
| // Since |g_thread_cache_key| is shared, make sure that no more than one |
| // PartitionRoot can use it. |
| static std::atomic<PartitionRoot*> g_thread_cache_root; |
| |
| #if BUILDFLAG(IS_WIN) |
| void OnDllProcessDetach() { |
| // Very late allocations do occur (see crbug.com/1159411#c7 for instance), |
| // including during CRT teardown. This is problematic for the thread cache, |
| // which relies on the CRT for TLS access. This cannot be mitigated inside |
| // the thread cache (since getting to it requires querying TLS), but the |
| // PartitionRoot associated with the thread cache can be made to no longer |
| // use the thread cache. |
| g_thread_cache_root.load(std::memory_order_relaxed) |
| ->settings.with_thread_cache = false; |
| } |
| #endif |
| |
| static bool g_thread_cache_key_created = false; |
| } // namespace |
| |
| uint8_t ThreadCache::global_limits_[ThreadCache::kBucketCount]; |
| |
| // Start with the normal size, not the maximum one. |
| uint16_t ThreadCache::largest_active_bucket_index_ = |
| internal::BucketIndexLookup::GetIndex(ThreadCache::kDefaultSizeThreshold); |
| |
| // static |
| ThreadCacheRegistry& ThreadCacheRegistry::Instance() { |
| return g_instance; |
| } |
| |
| void ThreadCacheRegistry::RegisterThreadCache(ThreadCache* cache) { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| cache->next_ = nullptr; |
| cache->prev_ = nullptr; |
| |
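| // Insert at the head of the doubly-linked list. |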
| ThreadCache* previous_head = list_head_; |
| list_head_ = cache; |
| cache->next_ = previous_head; |
| if (previous_head) { |
| previous_head->prev_ = cache; |
| } |
| } |
| |
| void ThreadCacheRegistry::UnregisterThreadCache(ThreadCache* cache) { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| if (cache->prev_) { |
| cache->prev_->next_ = cache->next_; |
| } |
| if (cache->next_) { |
| cache->next_->prev_ = cache->prev_; |
| } |
| if (cache == list_head_) { |
| list_head_ = cache->next_; |
| } |
| } |
| |
| void ThreadCacheRegistry::DumpStats(bool my_thread_only, |
| ThreadCacheStats* stats) { |
| ThreadCache::EnsureThreadSpecificDataInitialized(); |
| memset(reinterpret_cast<void*>(stats), 0, sizeof(ThreadCacheStats)); |
| |
| internal::ScopedGuard scoped_locker(GetLock()); |
| if (my_thread_only) { |
| auto* tcache = ThreadCache::Get(); |
| if (!ThreadCache::IsValid(tcache)) { |
| return; |
| } |
| tcache->AccumulateStats(stats); |
| } else { |
| ThreadCache* tcache = list_head_; |
| while (tcache) { |
| // Racy, as other threads are still allocating. This is not an issue, |
| // since we are only interested in statistics. However, this means that |
| // count is not necessarily equal to hits + misses for the various types |
| // of events. |
| tcache->AccumulateStats(stats); |
| tcache = tcache->next_; |
| } |
| } |
| } |
| |
| void ThreadCacheRegistry::PurgeAll() { |
| auto* current_thread_tcache = ThreadCache::Get(); |
| |
| // May take a while, don't hold the lock while purging. |
| // |
| // In most cases, the current thread is more important than other ones. For |
| // instance in renderers, it is the main thread. It is also the only thread |
| // that we can synchronously purge. |
| // |
| // The reason why we trigger the purge for this thread first is that, |
| // assuming all threads are allocating memory, they will start purging |
| // concurrently in the loop below. This would then make them all contend |
| // with the main thread for the partition lock, since it is acquired and |
| // released once per bucket. By purging the main thread first, we avoid this |
| // interference for this thread at least. |
| if (ThreadCache::IsValid(current_thread_tcache)) { |
| current_thread_tcache->Purge(); |
| } |
| |
| { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| ThreadCache* tcache = list_head_; |
| while (tcache) { |
| PA_DCHECK(ThreadCache::IsValid(tcache)); |
| // Cannot purge directly, need to ask the other thread to purge "at some |
| // point". |
| // Note that this will not work if the other thread is sleeping forever. |
| // TODO(lizeb): Handle sleeping threads. |
| if (tcache != current_thread_tcache) { |
| tcache->SetShouldPurge(); |
| } |
| tcache = tcache->next_; |
| } |
| } |
| } |
| |
| void ThreadCacheRegistry::ForcePurgeAllThreadAfterForkUnsafe() { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| ThreadCache* tcache = list_head_; |
| while (tcache) { |
| #if BUILDFLAG(PA_DCHECK_IS_ON) |
| // Before fork(), locks are acquired in the parent process. This means that |
| // a concurrent allocation in the parent which must be filled by the central |
| // allocator (i.e. the thread cache bucket is empty) will block inside the |
| // thread cache waiting for the lock to be released. |
| // |
| // In the child process, this allocation will never complete since this |
| // thread will not be resumed. However, calling |Purge()| triggers the |
| // reentrancy guard since the parent process thread was suspended from |
| // within the thread cache. |
| // Clear the guard to prevent this from crashing. |
| tcache->is_in_thread_cache_ = false; |
| #endif |
| // There is a PA_DCHECK() in code called from |Purge()| checking that thread |
| // cache memory accounting is correct. Since we are after fork() and the |
| // other threads got interrupted mid-flight, this guarantee does not hold, |
| // and we get inconsistent results. Rather than giving up on checking this |
| // invariant in regular code, reset it here so that the PA_DCHECK() |
| // passes. See crbug.com/1216964. |
| tcache->cached_memory_ = tcache->CachedMemory(); |
| |
| // At this point, we should call |TryPurge|. However, due to the thread |
| // cache being possibly inconsistent at this point, this may crash. Rather |
| // than crash, we'd prefer to simply not purge, even though this may leak |
| // memory in some cases. |
| // |
| // see crbug.com/1289092 for details of the crashes. |
| |
| tcache = tcache->next_; |
| } |
| } |
| |
| void ThreadCacheRegistry::SetLargestActiveBucketIndex( |
| uint8_t largest_active_bucket_index) { |
| largest_active_bucket_index_ = largest_active_bucket_index; |
| } |
| |
| void ThreadCacheRegistry::SetThreadCacheMultiplier(float multiplier) { |
| // Two steps: |
| // - Set the global limits, which will affect newly created threads. |
| // - Enumerate all thread caches and set the limit to the global one. |
| { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| ThreadCache* tcache = list_head_; |
| |
| // This can be called before *any* thread cache has serviced *any* |
| // allocation, which happens in tests, and in theory in non-test code as |
| // well. In that case, there is nothing to update. |
| if (!tcache) { |
| return; |
| } |
| |
| // Set the global limits while locked, because we need |tcache->root_|. |
| ThreadCache::SetGlobalLimits(tcache->root_, multiplier); |
| |
| while (tcache) { |
| PA_DCHECK(ThreadCache::IsValid(tcache)); |
| for (int index = 0; index < ThreadCache::kBucketCount; index++) { |
| // This is racy, but we don't care if the limit is enforced later, and |
| // we really want to avoid atomic instructions on the fast path. |
| tcache->buckets_[index].limit.store(ThreadCache::global_limits_[index], |
| std::memory_order_relaxed); |
| } |
| |
| tcache = tcache->next_; |
| } |
| } |
| } |
| |
| void ThreadCacheRegistry::SetPurgingConfiguration( |
| const internal::base::TimeDelta min_purge_interval, |
| const internal::base::TimeDelta max_purge_interval, |
| const internal::base::TimeDelta default_purge_interval, |
| size_t min_cached_memory_for_purging_bytes) { |
| PA_CHECK(min_purge_interval <= default_purge_interval); |
| PA_CHECK(default_purge_interval <= max_purge_interval); |
| min_purge_interval_ = min_purge_interval; |
| max_purge_interval_ = max_purge_interval; |
| default_purge_interval_ = default_purge_interval; |
| min_cached_memory_for_purging_bytes_ = min_cached_memory_for_purging_bytes; |
| is_purging_configured_ = true; |
| } |
| |
| void ThreadCacheRegistry::RunPeriodicPurge() { |
| if (!periodic_purge_is_initialized_) { |
| ThreadCache::EnsureThreadSpecificDataInitialized(); |
| periodic_purge_is_initialized_ = true; |
| } |
| |
| PA_CHECK(is_purging_configured_); |
| |
| // Summing across all threads can be slow, but is necessary. Otherwise we rely |
| // on the assumption that the current thread is a good proxy for overall |
| // allocation activity. This is not the case for all process types. |
| // |
| // Since there is no synchronization with other threads, the value is stale, |
| // which is fine. |
| size_t cached_memory_approx = 0; |
| { |
| internal::ScopedGuard scoped_locker(GetLock()); |
| ThreadCache* tcache = list_head_; |
| // Can run when there is no thread cache, in which case there is nothing to |
| // do, and the task should not be rescheduled. This would typically indicate |
| // a case where the thread cache was never enabled, or got disabled. |
| if (!tcache) { |
| return; |
| } |
| |
| while (tcache) { |
| cached_memory_approx += tcache->cached_memory_; |
| tcache = tcache->next_; |
| } |
| } |
| |
| // If cached memory is low, this means that either memory footprint is fine, |
| // or the process is mostly idle, and not allocating much since the last |
| // purge. In this case, back off. On the other hand, if there is a lot of |
| // cached memory, make purge more frequent, but always within a set frequency |
| // range. |
| // |
| // There is a potential drawback: a process that was idle for a long time and |
| // suddenly becomes very active will take some time to go back to a regularly |
| // scheduled purge with a small enough interval. This is the case, for |
| // instance, for a renderer moving to the foreground. To mitigate that, if |
| // the jump in cached memory is very large, make a greater leap towards |
| // faster purging. |
| if (cached_memory_approx > 10 * min_cached_memory_for_purging_bytes_) { |
| periodic_purge_next_interval_ = |
| std::min(default_purge_interval_, periodic_purge_next_interval_ / 2); |
| } else if (cached_memory_approx > 2 * min_cached_memory_for_purging_bytes_) { |
| periodic_purge_next_interval_ = |
| std::max(min_purge_interval_, periodic_purge_next_interval_ / 2); |
| } else if (cached_memory_approx < min_cached_memory_for_purging_bytes_) { |
| periodic_purge_next_interval_ = |
| std::min(max_purge_interval_, periodic_purge_next_interval_ * 2); |
| } |
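| // Illustrative example (the intervals and threshold here are made up; real |
| // values come from SetPurgingConfiguration()): with min/default/max |
| // intervals of 1s/4s/16s and a 500kiB |
| // |min_cached_memory_for_purging_bytes_|, caching more than 5MiB sets the |
| // next interval to min(4s, half the current one), caching between 1MiB and |
| // 5MiB sets it to max(1s, half the current one), and caching less than |
| // 500kiB sets it to min(16s, twice the current one). |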
| |
| // Make sure that the next interval is in the right bounds. Even though the |
| // logic above should eventually converge to a reasonable interval, if a |
| // sleeping background thread holds onto a large amount of cached memory, then |
| // |PurgeAll()| will not free any memory from it, and the first branch above |
| // can be taken repeatedly until the interval gets very small, as the amount |
| // of cached memory cannot change between calls (since we do not purge |
| // background threads, but only ask them to purge their own cache at the next |
| // allocation). |
| periodic_purge_next_interval_ = std::clamp( |
| periodic_purge_next_interval_, min_purge_interval_, max_purge_interval_); |
| |
| PurgeAll(); |
| } |
| |
| int64_t ThreadCacheRegistry::GetPeriodicPurgeNextIntervalInMicroseconds() |
| const { |
| return periodic_purge_next_interval_.InMicroseconds(); |
| } |
| |
| void ThreadCacheRegistry::ResetForTesting() { |
| periodic_purge_next_interval_ = default_purge_interval_; |
| } |
| |
| // static |
| void ThreadCache::EnsureThreadSpecificDataInitialized() { |
| // Using the registry lock to protect from concurrent initialization without |
| // adding a special-purpose lock. |
| internal::ScopedGuard scoped_locker( |
| ThreadCacheRegistry::Instance().GetLock()); |
| if (g_thread_cache_key_created) { |
| return; |
| } |
| |
| bool ok = internal::PartitionTlsCreate(&internal::g_thread_cache_key, Delete); |
| PA_CHECK(ok); |
| g_thread_cache_key_created = true; |
| } |
| |
| // static |
| void ThreadCache::DeleteForTesting(ThreadCache* tcache) { |
| ThreadCache::Delete(tcache); |
| } |
| |
| // static |
| void ThreadCache::SwapForTesting(PartitionRoot* root) { |
| auto* old_tcache = ThreadCache::Get(); |
| g_thread_cache_root.store(nullptr, std::memory_order_relaxed); |
| if (old_tcache) { |
| ThreadCache::DeleteForTesting(old_tcache); |
| } |
| if (root) { |
| Init(root); |
| Create(root); |
| } else { |
| #if BUILDFLAG(IS_WIN) |
| // OnDllProcessDetach accesses g_thread_cache_root which is nullptr now. |
| internal::PartitionTlsSetOnDllProcessDetach(nullptr); |
| #endif |
| } |
| } |
| |
| // static |
| void ThreadCache::RemoveTombstoneForTesting() { |
| PA_CHECK(IsTombstone(Get())); |
| internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr); |
| } |
| |
| // static |
| void ThreadCache::Init(PartitionRoot* root) { |
| #if BUILDFLAG(IS_NACL) |
| static_assert(false, "PartitionAlloc isn't supported for NaCl"); |
| #endif |
| PA_CHECK(root->buckets[kBucketCount - 1].slot_size == |
| ThreadCache::kLargeSizeThreshold); |
| PA_CHECK(root->buckets[largest_active_bucket_index_].slot_size == |
| ThreadCache::kDefaultSizeThreshold); |
| |
| EnsureThreadSpecificDataInitialized(); |
| |
| // Make sure that only one PartitionRoot wants a thread cache. |
| PartitionRoot* expected = nullptr; |
| if (!g_thread_cache_root.compare_exchange_strong(expected, root, |
| std::memory_order_seq_cst, |
| std::memory_order_seq_cst)) { |
| PA_CHECK(false) |
| << "Only one PartitionRoot is allowed to have a thread cache"; |
| } |
| |
| #if BUILDFLAG(IS_WIN) |
| internal::PartitionTlsSetOnDllProcessDetach(OnDllProcessDetach); |
| #endif |
| |
| SetGlobalLimits(root, kDefaultMultiplier); |
| } |
| |
| // static |
| void ThreadCache::SetGlobalLimits(PartitionRoot* root, float multiplier) { |
| size_t initial_value = |
| static_cast<size_t>(kSmallBucketBaseCount) * multiplier; |
| |
| for (int index = 0; index < kBucketCount; index++) { |
| const auto& root_bucket = root->buckets[index]; |
| // Invalid bucket. |
| if (!root_bucket.active_slot_spans_head) { |
| global_limits_[index] = 0; |
| continue; |
| } |
| |
| // Smaller allocations are more frequent, and more performance-sensitive. |
| // Cache more small objects, and fewer larger ones, to save memory. |
| size_t slot_size = root_bucket.slot_size; |
| size_t value; |
| if (slot_size <= 128) { |
| value = initial_value; |
| } else if (slot_size <= 256) { |
| value = initial_value / 2; |
| } else if (slot_size <= 512) { |
| value = initial_value / 4; |
| } else { |
| value = initial_value / 8; |
| } |
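| // For example, if |initial_value| is 64, buckets with slot sizes up to 128 |
| // bytes may hold 64 cached entries, up to 256 bytes 32 entries, up to 512 |
| // bytes 16 entries, and larger ones 8 entries (before the clamping below). |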
| |
| // Bare minimum so that malloc() / free() in a loop will not hit the central |
| // allocator each time. |
| constexpr size_t kMinLimit = 1; |
| // |PutInBucket()| is called on a full bucket, which should not overflow. |
| constexpr size_t kMaxLimit = std::numeric_limits<uint8_t>::max() - 1; |
| global_limits_[index] = |
| static_cast<uint8_t>(std::clamp(value, kMinLimit, kMaxLimit)); |
| PA_DCHECK(global_limits_[index] >= kMinLimit); |
| PA_DCHECK(global_limits_[index] <= kMaxLimit); |
| } |
| } |
| |
| // static |
| void ThreadCache::SetLargestCachedSize(size_t size) { |
| if (size > ThreadCache::kLargeSizeThreshold) { |
| size = ThreadCache::kLargeSizeThreshold; |
| } |
| largest_active_bucket_index_ = PartitionRoot::SizeToBucketIndex( |
| size, PartitionRoot::BucketDistribution::kNeutral); |
| PA_CHECK(largest_active_bucket_index_ < kBucketCount); |
| ThreadCacheRegistry::Instance().SetLargestActiveBucketIndex( |
| largest_active_bucket_index_); |
| } |
| |
| // static |
| ThreadCache* ThreadCache::Create(PartitionRoot* root) { |
| PA_CHECK(root); |
| // See comment in thread_cache.h, this is used to make sure |
| // kThreadCacheNeedleArray is kept in the final binary. |
| PA_CHECK(tools::kThreadCacheNeedleArray[0] == tools::kNeedle1); |
| |
| // Placement new and RawAlloc() are used, as otherwise when this partition is |
| // the malloc() implementation, the memory allocated for the new thread cache |
| // would make this code reentrant. |
| // |
| // This also means that deallocation must use RawFreeStatic(), hence the |
| // operator delete() implementation below. |
| size_t raw_size = root->AdjustSizeForExtrasAdd(sizeof(ThreadCache)); |
| size_t usable_size; |
| bool already_zeroed; |
| |
| auto* bucket = root->buckets + PartitionRoot::SizeToBucketIndex( |
| raw_size, root->GetBucketDistribution()); |
| uintptr_t buffer = root->RawAlloc<AllocFlags::kZeroFill>( |
| bucket, raw_size, internal::PartitionPageSize(), &usable_size, |
| &already_zeroed); |
| ThreadCache* tcache = |
| new (internal::SlotStartAddr2Ptr(buffer)) ThreadCache(root); |
| |
| // This may allocate. |
| internal::PartitionTlsSet(internal::g_thread_cache_key, tcache); |
| #if PA_CONFIG(THREAD_CACHE_FAST_TLS) |
| // |thread_local| variables with destructors cause issues on some platforms. |
| // Since we need a destructor (to empty the thread cache), we cannot use it |
| // directly. However, TLS accesses with |thread_local| are typically faster, |
| // as it can turn into a fixed offset load from a register (GS/FS on Linux |
| // x86, for instance). On Windows, saving/restoring the last error increases |
| // cost as well. |
| // |
| // To still get good performance, use |thread_local| to store a raw pointer, |
| // and rely on the platform TLS to call the destructor. |
| internal::g_thread_cache = tcache; |
| #endif // PA_CONFIG(THREAD_CACHE_FAST_TLS) |
| |
| return tcache; |
| } |
| |
| ThreadCache::ThreadCache(PartitionRoot* root) |
| : should_purge_(false), |
| root_(root), |
| thread_id_(internal::base::PlatformThread::CurrentId()), |
| next_(nullptr), |
| prev_(nullptr) { |
| ThreadCacheRegistry::Instance().RegisterThreadCache(this); |
| |
| memset(&stats_, 0, sizeof(stats_)); |
| |
| for (int index = 0; index < kBucketCount; index++) { |
| const auto& root_bucket = root->buckets[index]; |
| Bucket* tcache_bucket = &buckets_[index]; |
| tcache_bucket->freelist_head = nullptr; |
| tcache_bucket->count = 0; |
| tcache_bucket->limit.store(global_limits_[index], |
| std::memory_order_relaxed); |
| |
| tcache_bucket->slot_size = root_bucket.slot_size; |
| // Invalid bucket. |
| if (!root_bucket.is_valid()) { |
| // Explicitly set this, as size computations iterate over all buckets. |
| tcache_bucket->limit.store(0, std::memory_order_relaxed); |
| } |
| } |
| } |
| |
| ThreadCache::~ThreadCache() { |
| ThreadCacheRegistry::Instance().UnregisterThreadCache(this); |
| Purge(); |
| } |
| |
| // static |
| void ThreadCache::Delete(void* tcache_ptr) { |
| auto* tcache = static_cast<ThreadCache*>(tcache_ptr); |
| |
| if (!IsValid(tcache)) { |
| return; |
| } |
| |
| #if PA_CONFIG(THREAD_CACHE_FAST_TLS) |
| internal::g_thread_cache = nullptr; |
| #else |
| internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr); |
| #endif |
| |
| auto* root = tcache->root_; |
| tcache->~ThreadCache(); |
| // ThreadCache was allocated using RawAlloc() and SlotStartAddr2Ptr(), so it |
| // is shifted by extras, but is MTE-tagged. |
| root->RawFree(internal::SlotStartPtr2Addr(tcache_ptr)); |
| |
| #if BUILDFLAG(IS_WIN) |
| // On Windows, allocations do occur during thread/process teardown, make sure |
| // they don't resurrect the thread cache. |
| // |
| // Don't MTE-tag, as it'd mess with the sentinel value. |
| // |
| // TODO(lizeb): Investigate whether this is needed on POSIX as well. |
| internal::PartitionTlsSet(internal::g_thread_cache_key, |
| reinterpret_cast<void*>(kTombstone)); |
| #if PA_CONFIG(THREAD_CACHE_FAST_TLS) |
| internal::g_thread_cache = reinterpret_cast<ThreadCache*>(kTombstone); |
| #endif |
| |
| #endif // BUILDFLAG(IS_WIN) |
| } |
| |
| ThreadCache::Bucket::Bucket() { |
| limit.store(0, std::memory_order_relaxed); |
| } |
| |
| void ThreadCache::FillBucket(size_t bucket_index) { |
| // Filling multiple elements from the central allocator at a time has several |
| // advantages: |
| // - Amortize lock acquisition |
| // - Increase hit rate |
| // - Can improve locality, as consecutive allocations from the central |
| // allocator will likely return close addresses, especially early on. |
| // |
| // However, do not take too many items, to prevent memory bloat. |
| // |
| // Cache filling / purging policy: |
| // We aim at keeping the buckets neither empty nor full, while minimizing |
| // requests to the central allocator. |
| // |
| // For each bucket, there is a |limit| on how many cached objects the bucket |
| // can hold, so |count| < |limit| at all times. |
| // - Clearing: limit -> limit / 2 |
| // - Filling: 0 -> limit / kBatchFillRatio |
| // |
| // These thresholds are somewhat arbitrary, with these considerations: |
| // (1) Batched filling should not completely fill the bucket |
| // (2) Batched clearing should not completely clear the bucket |
| // (3) Batched filling should not be too eager |
| // |
| // If (1) and (2) do not hold, we risk oscillations of bucket filling / |
| // clearing which would greatly increase calls to the central allocator. (3) |
| // tries to keep memory usage low. So clearing half of the bucket, and filling |
| // a quarter of it are sensible defaults. |
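| // For instance, assuming kBatchFillRatio is 8, a bucket with a limit of 32 |
| // is filled from empty to 4 entries, and trimmed from full back down to 16. |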
| PA_INCREMENT_COUNTER(stats_.batch_fill_count); |
| |
| Bucket& bucket = buckets_[bucket_index]; |
| // Some buckets may have a limit lower than |kBatchFillRatio|, but we still |
| // want to allocate at least one slot, otherwise we wrongly return nullptr, |
| // which ends up deactivating the bucket. |
| // |
| // In these cases, we do not really batch bucket filling, but this only |
| // affects the largest buckets, where over-allocating is undesirable anyway. |
| int count = std::max( |
| 1, bucket.limit.load(std::memory_order_relaxed) / kBatchFillRatio); |
| |
| size_t usable_size; |
| bool is_already_zeroed; |
| |
| PA_DCHECK(!root_->buckets[bucket_index].CanStoreRawSize()); |
| PA_DCHECK(!root_->buckets[bucket_index].is_direct_mapped()); |
| |
| size_t allocated_slots = 0; |
| // Same as calling RawAlloc() |count| times, but acquires the lock only once. |
| internal::ScopedGuard guard(internal::PartitionRootLock(root_)); |
| for (int i = 0; i < count; i++) { |
| // Thread cache fill should not trigger expensive operations, to not grab |
| // the lock for a long time needlessly, but also to not inflate memory |
| // usage. Indeed, without AllocFlags::kFastPathOrReturnNull, cache |
| // fill may activate a new PartitionPage, or even a new SuperPage, which is |
| // clearly not desirable. |
| // |
| // |raw_size| is set to the slot size, as we don't know the requested size. |
| // However, it is only used for direct-mapped allocations and single-slot |
| // ones anyway, which are not handled here. |
| uintptr_t slot_start = |
| root_->AllocFromBucket<AllocFlags::kFastPathOrReturnNull | |
| AllocFlags::kReturnNull>( |
| &root_->buckets[bucket_index], |
| root_->buckets[bucket_index].slot_size /* raw_size */, |
| internal::PartitionPageSize(), &usable_size, &is_already_zeroed); |
| |
| // Either the previous allocation would require a slow path allocation, or |
| // the central allocator is out of memory. If the bucket was filled with |
| // some objects, then the allocation will be handled normally. Otherwise, |
| // this goes to the central allocator, which will service the allocation, |
| // return nullptr or crash. |
| if (!slot_start) { |
| break; |
| } |
| |
| allocated_slots++; |
| PutInBucket(bucket, slot_start); |
| } |
| |
| cached_memory_ += allocated_slots * bucket.slot_size; |
| } |
| |
| void ThreadCache::ClearBucket(Bucket& bucket, size_t limit) { |
| ClearBucketHelper<true>(bucket, limit); |
| } |
| |
| template <bool crash_on_corruption> |
| void ThreadCache::ClearBucketHelper(Bucket& bucket, size_t limit) { |
| // Avoids acquiring the lock needlessly. |
| if (!bucket.count || bucket.count <= limit) { |
| return; |
| } |
| |
| // This serves two purposes: error checking and avoiding stalls when grabbing |
| // the lock: |
| // 1. Error checking: this is pretty clear. Since this path is taken |
| // infrequently, and is going to walk the entire freelist anyway, its |
| // incremental cost should be very small. Indeed, we free from the tail of |
| // the list, so all calls here will end up walking the entire freelist, and |
| // incurring the same amount of cache misses. |
| // 2. Avoiding stalls: If one of the freelist accesses in |FreeAfter()| |
| // triggers a major page fault, and we are running on a low-priority |
| // thread, we don't want the thread to be blocked while holding the lock, |
| // causing a priority inversion. |
| if constexpr (crash_on_corruption) { |
| bucket.freelist_head->CheckFreeListForThreadCache(bucket.slot_size); |
| } |
| |
| uint8_t count_before = bucket.count; |
| if (limit == 0) { |
| FreeAfter<crash_on_corruption>(bucket.freelist_head, bucket.slot_size); |
| bucket.freelist_head = nullptr; |
| } else { |
| // Free the *end* of the list, not the head, since the head contains the |
| // most recently touched memory. |
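| // For example, with |limit| == 2 and a freelist A -> B -> C -> D, A and B |
| // are kept in the thread cache, while C and D are returned to the central |
| // allocator. |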
| auto* head = bucket.freelist_head; |
| size_t items = 1; // Cannot free the freelist head. |
| while (items < limit) { |
| head = head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size); |
| items++; |
| } |
| FreeAfter<crash_on_corruption>( |
| head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size), |
| bucket.slot_size); |
| head->SetNext(nullptr); |
| } |
| bucket.count = limit; |
| uint8_t count_after = bucket.count; |
| size_t freed_memory = (count_before - count_after) * bucket.slot_size; |
| PA_DCHECK(cached_memory_ >= freed_memory); |
| cached_memory_ -= freed_memory; |
| |
| PA_DCHECK(cached_memory_ == CachedMemory()); |
| } |
| |
| template <bool crash_on_corruption> |
| void ThreadCache::FreeAfter(internal::EncodedNextFreelistEntry* head, |
| size_t slot_size) { |
| // Acquire the lock once. Deallocations from the same bucket are likely to |
| // be hitting the same cache lines in the central allocator, and lock |
| // acquisitions can be expensive. |
| internal::ScopedGuard guard(internal::PartitionRootLock(root_)); |
| while (head) { |
| uintptr_t slot_start = internal::SlotStartPtr2Addr(head); |
| head = head->GetNextForThreadCache<crash_on_corruption>(slot_size); |
| root_->RawFreeLocked(slot_start); |
| } |
| } |
| |
| void ThreadCache::ResetForTesting() { |
| stats_.alloc_count = 0; |
| stats_.alloc_hits = 0; |
| stats_.alloc_misses = 0; |
| |
| stats_.alloc_miss_empty = 0; |
| stats_.alloc_miss_too_large = 0; |
| |
| stats_.cache_fill_count = 0; |
| stats_.cache_fill_hits = 0; |
| stats_.cache_fill_misses = 0; |
| |
| stats_.batch_fill_count = 0; |
| |
| stats_.bucket_total_memory = 0; |
| stats_.metadata_overhead = 0; |
| |
| Purge(); |
| PA_CHECK(cached_memory_ == 0u); |
| should_purge_.store(false, std::memory_order_relaxed); |
| } |
| |
| size_t ThreadCache::CachedMemory() const { |
| size_t total = 0; |
| for (const Bucket& bucket : buckets_) { |
| total += bucket.count * static_cast<size_t>(bucket.slot_size); |
| } |
| |
| return total; |
| } |
| |
| void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const { |
| stats->alloc_count += stats_.alloc_count; |
| stats->alloc_hits += stats_.alloc_hits; |
| stats->alloc_misses += stats_.alloc_misses; |
| |
| stats->alloc_miss_empty += stats_.alloc_miss_empty; |
| stats->alloc_miss_too_large += stats_.alloc_miss_too_large; |
| |
| stats->cache_fill_count += stats_.cache_fill_count; |
| stats->cache_fill_hits += stats_.cache_fill_hits; |
| stats->cache_fill_misses += stats_.cache_fill_misses; |
| |
| stats->batch_fill_count += stats_.batch_fill_count; |
| |
| #if PA_CONFIG(THREAD_CACHE_ALLOC_STATS) |
| for (size_t i = 0; i < internal::kNumBuckets + 1; i++) { |
| stats->allocs_per_bucket_[i] += stats_.allocs_per_bucket_[i]; |
| } |
| #endif // PA_CONFIG(THREAD_CACHE_ALLOC_STATS) |
| |
| // cached_memory_ is not necessarily equal to |CachedMemory()| here, since |
| // this function can be called racily from another thread, to collect |
| // statistics. Hence no DCHECK_EQ(CachedMemory(), cached_memory_). |
| stats->bucket_total_memory += cached_memory_; |
| |
| stats->metadata_overhead += sizeof(*this); |
| } |
| |
| void ThreadCache::SetShouldPurge() { |
| should_purge_.store(true, std::memory_order_relaxed); |
| } |
| |
| void ThreadCache::Purge() { |
| PA_REENTRANCY_GUARD(is_in_thread_cache_); |
| PurgeInternal(); |
| } |
| |
| void ThreadCache::TryPurge() { |
| PA_REENTRANCY_GUARD(is_in_thread_cache_); |
| PurgeInternalHelper<false>(); |
| } |
| |
| // static |
| void ThreadCache::PurgeCurrentThread() { |
| auto* tcache = Get(); |
| if (IsValid(tcache)) { |
| tcache->Purge(); |
| } |
| } |
| |
| void ThreadCache::PurgeInternal() { |
| PurgeInternalHelper<true>(); |
| } |
| |
| void ThreadCache::ResetPerThreadAllocationStatsForTesting() { |
| thread_alloc_stats_ = {}; |
| } |
| |
| template <bool crash_on_corruption> |
| void ThreadCache::PurgeInternalHelper() { |
| should_purge_.store(false, std::memory_order_relaxed); |
| // TODO(lizeb): Investigate whether lock acquisition should be less |
| // frequent. |
| // |
| // Note: iterate over all buckets, even the inactive ones. Since |
| // |largest_active_bucket_index_| can be lowered at runtime, there may be |
| // memory already cached in the inactive buckets. They should still be |
| // purged. |
| for (auto& bucket : buckets_) { |
| ClearBucketHelper<crash_on_corruption>(bucket, 0); |
| } |
| } |
| |
| } // namespace partition_alloc |