// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_
#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc-inl.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/gtest_prod_util.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/thread_annotations.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/time/time.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_buildflags.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_config.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_forward.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_bucket_lookup.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_freelist_entry.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_lock.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_stats.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_tls.h"
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_64) && BUILDFLAG(HAS_64_BIT_POINTERS)
#include <algorithm>
#endif
namespace partition_alloc {
class ThreadCache;
namespace tools {
// This is used from ThreadCacheInspector, which runs in a different process. It
// scans the process memory looking for the two needles, to locate the thread
// cache registry instance.
//
// These two values were chosen randomly, and in particular neither is a valid
// pointer on most 64-bit architectures.
#if BUILDFLAG(HAS_64_BIT_POINTERS)
constexpr uintptr_t kNeedle1 = 0xe69e32f3ad9ea63;
constexpr uintptr_t kNeedle2 = 0x9615ee1c5eb14caf;
#else
constexpr uintptr_t kNeedle1 = 0xe69e32f3;
constexpr uintptr_t kNeedle2 = 0x9615ee1c;
#endif // BUILDFLAG(HAS_64_BIT_POINTERS)
// This array contains, in order:
// - kNeedle1
// - &ThreadCacheRegistry::Instance()
// - kNeedle2
//
// It is referenced in the thread cache constructor to make sure it is not
// removed by the compiler. It is also not const to make sure it ends up in
// .data.
constexpr size_t kThreadCacheNeedleArraySize = 4;
extern uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize];
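// Illustrative sketch only (not part of this header): an out-of-process
// inspector could locate the registry by scanning readable memory for the two
// needles and reading the pointer stored between them. The helper name and the
// word-based scan below are assumptions for illustration.
//
//   uintptr_t FindRegistryAddress(const uintptr_t* words, size_t count) {
//     for (size_t i = 0; i + 2 < count; i++) {
//       if (words[i] == kNeedle1 && words[i + 2] == kNeedle2)
//         return words[i + 1];  // &ThreadCacheRegistry::Instance().
//     }
//     return 0;
//   }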
class HeapDumper;
class ThreadCacheInspector;
} // namespace tools
namespace internal {
extern PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
extern PA_COMPONENT_EXPORT(
PARTITION_ALLOC) thread_local ThreadCache* g_thread_cache;
#endif
} // namespace internal
struct ThreadCacheLimits {
// When trying to conserve memory, set the thread cache limit to this.
static constexpr size_t kDefaultSizeThreshold = 512;
// 32kiB is chosen here because, from local experiments, "zone" allocation in
// V8 is performance-sensitive, and zones can (and do) grow up to 32kiB for
// each individual allocation.
static constexpr size_t kLargeSizeThreshold = 1 << 15;
static_assert(kLargeSizeThreshold <= std::numeric_limits<uint16_t>::max(),
"");
};
constexpr internal::base::TimeDelta kMinPurgeInterval =
internal::base::Seconds(1);
constexpr internal::base::TimeDelta kMaxPurgeInterval =
internal::base::Minutes(1);
constexpr internal::base::TimeDelta kDefaultPurgeInterval =
2 * kMinPurgeInterval;
constexpr size_t kMinCachedMemoryForPurgingBytes = 500 * 1024;
// Global registry of all ThreadCache instances.
//
// This class cannot allocate in the (Un)registerThreadCache() functions, as
// they are called from the ThreadCache constructor, which runs from within the
// allocator. However, the other members can allocate.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCacheRegistry {
public:
static ThreadCacheRegistry& Instance();
// Do not instantiate.
//
// Several things are surprising here:
// - The constructor is public even though this is intended to be a singleton:
// we cannot use a "static local" variable in |Instance()| as this is
// reached too early during CRT initialization on Windows, meaning that
// static local variables don't work (as they call into the uninitialized
// runtime). To sidestep that, we use a regular global variable in the .cc,
// which is fine as this object's constructor is constexpr.
// - Marked inline so that the chromium style plugin doesn't complain that a
// "complex constructor" has an inline body. This warning is disabled when
// the constructor is explicitly marked "inline". Note that this is a false
// positive of the plugin, since constexpr implies inline.
inline constexpr ThreadCacheRegistry();
void RegisterThreadCache(ThreadCache* cache);
void UnregisterThreadCache(ThreadCache* cache);
// Prints statistics for all thread caches, or this thread's only.
void DumpStats(bool my_thread_only, ThreadCacheStats* stats);
// Purges this thread's cache, and asks the other ones to trigger Purge() at
// a later point (during a deallocation).
void PurgeAll();
// Runs `PurgeAll` and updates the next interval which
// `GetPeriodicPurgeNextIntervalInMicroseconds` returns.
//
// Note that it is the caller's responsibility to invoke this member function
// periodically with an appropriate interval. This function does not schedule
// any task nor timer.
void RunPeriodicPurge();
// Returns the appropriate interval at which to next invoke `RunPeriodicPurge`.
int64_t GetPeriodicPurgeNextIntervalInMicroseconds() const;
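// Illustrative usage sketch (an assumption, not an API defined here): an
// embedder is expected to drive purging with a repeating task, re-arming it
// with the interval returned above. `PostDelayedTask` stands for a
// hypothetical scheduler.
//
//   void PeriodicPurgeTask() {
//     auto& registry = ThreadCacheRegistry::Instance();
//     registry.RunPeriodicPurge();
//     int64_t delay_us =
//         registry.GetPeriodicPurgeNextIntervalInMicroseconds();
//     PostDelayedTask(&PeriodicPurgeTask, delay_us);  // Hypothetical.
//   }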
// Controls the thread cache size, by setting the multiplier to a value above
// or below |ThreadCache::kDefaultMultiplier|.
void SetThreadCacheMultiplier(float multiplier);
void SetLargestActiveBucketIndex(uint8_t largest_active_bucket_index);
// Controls the thread cache purging configuration.
void SetPurgingConfiguration(
const internal::base::TimeDelta min_purge_interval,
const internal::base::TimeDelta max_purge_interval,
const internal::base::TimeDelta default_purge_interval,
size_t min_cached_memory_for_purging_bytes);
internal::base::TimeDelta min_purge_interval() const {
return min_purge_interval_;
}
internal::base::TimeDelta max_purge_interval() const {
return max_purge_interval_;
}
internal::base::TimeDelta default_purge_interval() const {
return default_purge_interval_;
}
size_t min_cached_memory_for_purging_bytes() const {
return min_cached_memory_for_purging_bytes_;
}
bool is_purging_configured() const { return is_purging_configured_; }
static internal::Lock& GetLock() { return Instance().lock_; }
// Purges all thread caches *now*. This is completely thread-unsafe, and
// should only be called in a post-fork() handler.
void ForcePurgeAllThreadAfterForkUnsafe();
void ResetForTesting();
private:
friend class tools::ThreadCacheInspector;
friend class tools::HeapDumper;
// Not using base::Lock as the object's constructor must be constexpr.
internal::Lock lock_;
ThreadCache* list_head_ PA_GUARDED_BY(GetLock()) = nullptr;
bool periodic_purge_is_initialized_ = false;
internal::base::TimeDelta min_purge_interval_;
internal::base::TimeDelta max_purge_interval_;
internal::base::TimeDelta default_purge_interval_;
size_t min_cached_memory_for_purging_bytes_ = 0u;
internal::base::TimeDelta periodic_purge_next_interval_;
bool is_purging_configured_ = false;
uint8_t largest_active_bucket_index_ = internal::BucketIndexLookup::GetIndex(
ThreadCacheLimits::kDefaultSizeThreshold);
};
constexpr ThreadCacheRegistry::ThreadCacheRegistry() = default;
#if PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
#define PA_INCREMENT_COUNTER(counter) ++counter
#else
#define PA_INCREMENT_COUNTER(counter) \
do { \
} while (0)
#endif // PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
#if BUILDFLAG(PA_DCHECK_IS_ON)
namespace internal {
class ReentrancyGuard {
public:
explicit ReentrancyGuard(bool& flag) : flag_(flag) {
PA_CHECK(!flag_);
flag_ = true;
}
~ReentrancyGuard() { flag_ = false; }
private:
bool& flag_;
};
} // namespace internal
#define PA_REENTRANCY_GUARD(x) \
internal::ReentrancyGuard guard { \
x \
}
#else // BUILDFLAG(PA_DCHECK_IS_ON)
#define PA_REENTRANCY_GUARD(x) \
do { \
} while (0)
#endif // BUILDFLAG(PA_DCHECK_IS_ON)
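// Illustrative sketch of what the guard catches, assuming a DCHECK-enabled
// build: re-entering a guarded scope on the same thread trips the PA_CHECK in
// the ReentrancyGuard constructor, e.g. if a thread cache operation ends up
// allocating and re-enters itself. The caller below is hypothetical.
//
//   void GuardedOperation(bool& flag) {
//     PA_REENTRANCY_GUARD(flag);   // Checks |flag| was false, then sets it.
//     GuardedOperation(flag);      // A nested guard would CHECK here.
//   }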
// Per-thread cache. *Not* thread-safe; must only be accessed from a single
// thread.
//
// In practice, this is easily enforced as long as only |instance| is
// manipulated, as it is a thread_local member. As such, any
// |ThreadCache::instance->*()| call will necessarily be done from a single
// thread.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCache {
public:
// Initializes the thread cache for |root|. May allocate, so should be called
// with the thread cache disabled on the partition side, and without the
// partition lock held.
//
// May only be called by a single PartitionRoot.
static void Init(PartitionRoot* root);
static void DeleteForTesting(ThreadCache* tcache);
// Deletes existing thread cache and creates a new one for |root|.
static void SwapForTesting(PartitionRoot* root);
// Removes the tombstone marker that would be returned by Get() otherwise.
static void RemoveTombstoneForTesting();
// Can be called several times, must be called before any ThreadCache
// interactions.
static void EnsureThreadSpecificDataInitialized();
static ThreadCache* Get() {
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
return internal::g_thread_cache;
#else
// This region isn't MTE-tagged.
return reinterpret_cast<ThreadCache*>(
internal::PartitionTlsGet(internal::g_thread_cache_key));
#endif
}
static bool IsValid(ThreadCache* tcache) {
// Do not MTE-untag, as it'd mess up the sentinel value.
return reinterpret_cast<uintptr_t>(tcache) & kTombstoneMask;
}
static bool IsTombstone(ThreadCache* tcache) {
// Do not MTE-untag, as it'd mess up the sentinel value.
return reinterpret_cast<uintptr_t>(tcache) == kTombstone;
}
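// Typical caller pattern, as an illustrative sketch (the real call sites are
// in the allocator fast paths): fetch the per-thread instance and check
// validity, which rejects both nullptr and the tombstone in a single test.
//
//   ThreadCache* tcache = ThreadCache::Get();
//   if (PA_LIKELY(ThreadCache::IsValid(tcache))) {
//     // Safe to use |tcache| from this thread.
//   } else if (ThreadCache::IsTombstone(tcache)) {
//     // The thread is being terminated; do not create a new cache.
//   }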
// Create a new ThreadCache associated with |root|.
// Must be called without the partition locked, as this may allocate.
static ThreadCache* Create(PartitionRoot* root);
~ThreadCache();
// Force placement new.
void* operator new(size_t) = delete;
void* operator new(size_t, void* buffer) { return buffer; }
void operator delete(void* ptr) = delete;
ThreadCache(const ThreadCache&) = delete;
ThreadCache(const ThreadCache&&) = delete;
ThreadCache& operator=(const ThreadCache&) = delete;
// Tries to put a slot at |slot_start| into the cache.
// The slot comes from the bucket at index |bucket_index| from the partition
// this cache is for.
//
// Returns true if the slot was put in the cache, and false otherwise. This
// can happen either because the cache is full or the allocation was too
// large.
PA_ALWAYS_INLINE bool MaybePutInCache(uintptr_t slot_start,
size_t bucket_index,
size_t* slot_size);
// Tries to allocate a memory slot from the cache.
// Returns 0 on failure.
//
// Has the same behavior as RawAlloc(), that is: no cookie nor ref-count
// handling. Sets |slot_size| to the allocated size upon success.
PA_ALWAYS_INLINE uintptr_t GetFromCache(size_t bucket_index,
size_t* slot_size);
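// Hedged sketch of how an allocation/deallocation fast path might consult the
// two functions above. The central-allocator helpers named here are
// hypothetical; the real call sites live in PartitionRoot.
//
//   size_t slot_size;
//   uintptr_t slot_start = tcache->GetFromCache(bucket_index, &slot_size);
//   if (!slot_start)
//     slot_start = CentralAlloc(bucket_index, &slot_size);  // Hypothetical.
//   ...
//   if (!tcache->MaybePutInCache(slot_start, bucket_index, &slot_size))
//     CentralFree(slot_start);  // Hypothetical slow path.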
// Asks this cache to trigger |Purge()| at a later point. Can be called from
// any thread.
void SetShouldPurge();
// Empties the cache.
// The Partition lock must *not* be held when calling this.
// Must be called from the thread this cache is for.
void Purge();
// |TryPurge| is the same as |Purge|, except that |TryPurge| will
// not crash if the thread cache is inconsistent. Normally inconsistency
// is a sign of a bug somewhere, so |Purge| should be preferred in most cases.
void TryPurge();
// Amount of cached memory for this thread's cache, in bytes.
size_t CachedMemory() const;
void AccumulateStats(ThreadCacheStats* stats) const;
// Purge the thread cache of the current thread, if one exists.
static void PurgeCurrentThread();
const ThreadAllocStats& thread_alloc_stats() const {
return thread_alloc_stats_;
}
size_t bucket_count_for_testing(size_t index) const {
return buckets_[index].count;
}
internal::base::PlatformThreadId thread_id() const { return thread_id_; }
// Sets the maximum size of allocations that may be cached by the thread
// cache. This applies to all threads. However, the maximum size is bounded by
// |kLargeSizeThreshold|.
static void SetLargestCachedSize(size_t size);
// Cumulative stats about *all* allocations made on the `root_` partition on
// this thread, that is, not only the allocations serviced by the thread cache,
// but all allocations, including large and direct-mapped ones. This should in
// theory be split into a separate PerThread data structure, but the thread
// cache is the only per-thread data we have as of now.
//
// TODO(lizeb): Investigate adding a proper per-thread data structure.
PA_ALWAYS_INLINE void RecordAllocation(size_t size);
PA_ALWAYS_INLINE void RecordDeallocation(size_t size);
void ResetPerThreadAllocationStatsForTesting();
// Fill 1 / kBatchFillRatio * bucket.limit slots at a time.
static constexpr uint16_t kBatchFillRatio = 8;
// Limit for the smallest bucket will be kDefaultMultiplier *
// kSmallBucketBaseCount by default.
static constexpr float kDefaultMultiplier = 2.;
static constexpr uint8_t kSmallBucketBaseCount = 64;
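// Worked example from the constants above: with the default multiplier, the
// smallest bucket may cache up to kDefaultMultiplier * kSmallBucketBaseCount =
// 2 * 64 = 128 slots; limits for the other buckets are derived in
// SetGlobalLimits().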
static constexpr size_t kDefaultSizeThreshold =
ThreadCacheLimits::kDefaultSizeThreshold;
static constexpr size_t kLargeSizeThreshold =
ThreadCacheLimits::kLargeSizeThreshold;
const ThreadCache* prev_for_testing() const
PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
return prev_;
}
const ThreadCache* next_for_testing() const
PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
return next_;
}
private:
friend class tools::HeapDumper;
friend class tools::ThreadCacheInspector;
struct Bucket {
internal::EncodedNextFreelistEntry* freelist_head = nullptr;
// Want to keep sizeof(Bucket) small, using small types.
uint8_t count = 0;
std::atomic<uint8_t> limit{}; // Can be changed from another thread.
uint16_t slot_size = 0;
Bucket();
};
static_assert(sizeof(Bucket) <= 2 * sizeof(void*), "Keep Bucket small.");
explicit ThreadCache(PartitionRoot* root);
static void Delete(void* thread_cache_ptr);
void PurgeInternal();
template <bool crash_on_corruption>
void PurgeInternalHelper();
// Fills a bucket from the central allocator.
void FillBucket(size_t bucket_index);
// Empties the |bucket| until there are at most |limit| objects in it.
template <bool crash_on_corruption>
void ClearBucketHelper(Bucket& bucket, size_t limit);
void ClearBucket(Bucket& bucket, size_t limit);
PA_ALWAYS_INLINE void PutInBucket(Bucket& bucket, uintptr_t slot_start);
void ResetForTesting();
// Releases the entire freelist starting at |head| to the root.
template <bool crash_on_corruption>
void FreeAfter(internal::EncodedNextFreelistEntry* head, size_t slot_size);
static void SetGlobalLimits(PartitionRoot* root, float multiplier);
static constexpr uint16_t kBucketCount =
internal::BucketIndexLookup::GetIndex(ThreadCache::kLargeSizeThreshold) +
1;
static_assert(
kBucketCount < internal::kNumBuckets,
"Cannot have more cached buckets than what the allocator supports");
// On some architectures, ThreadCache::Get() can be called and return
// something after the thread cache has been destroyed. In this case, the TLS
// slot is set to this value, to signal that the thread is being terminated and
// that the thread cache should not be used.
//
// This happens in particular on Windows, during program termination.
//
// We choose 0x1 as the value because it is an invalid pointer: it is not
// aligned, and it is too low. Also, checking !(ptr & kTombstoneMask) checks
// for nullptr and kTombstone at the same time.
static constexpr uintptr_t kTombstone = 0x1;
static constexpr uintptr_t kTombstoneMask = ~kTombstone;
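// Worked example of the combined check in IsValid(): with kTombstone == 0x1
// and kTombstoneMask == ~0x1, both nullptr (0x0 & kTombstoneMask == 0) and the
// tombstone (0x1 & kTombstoneMask == 0) fail the check, while any real,
// aligned ThreadCache pointer has non-zero masked bits and passes.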
static uint8_t global_limits_[kBucketCount];
// Index of the largest active bucket. Not all processes/platforms will use
// all buckets, as using larger buckets increases the memory footprint.
//
// TODO(lizeb): Investigate making this per-thread rather than static, to
// improve locality, and open the door to per-thread settings.
static uint16_t largest_active_bucket_index_;
// These are at the beginning as they're accessed for each allocation.
uint32_t cached_memory_ = 0;
std::atomic<bool> should_purge_;
ThreadCacheStats stats_;
ThreadAllocStats thread_alloc_stats_;
// The buckets_ array below is quite big overall, even though each Bucket is
// only 2 pointers.
Bucket buckets_[kBucketCount];
// Cold data below.
PartitionRoot* const root_;
const internal::base::PlatformThreadId thread_id_;
#if BUILDFLAG(PA_DCHECK_IS_ON)
bool is_in_thread_cache_ = false;
#endif
// Intrusive list since ThreadCacheRegistry::RegisterThreadCache() cannot
// allocate.
ThreadCache* next_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
ThreadCache* prev_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
friend class ThreadCacheRegistry;
friend class PartitionAllocThreadCacheTest;
friend class tools::ThreadCacheInspector;
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, Simple);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleObjectsCachedPerBucket);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
LargeAllocationsAreNotCached);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleThreadCaches);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, RecordStats);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
ThreadCacheRegistry);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleThreadCachesAccounting);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucket);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucketClamping);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucketMultipleThreads);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicSizeThreshold);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicSizeThresholdPurge);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, ClearFromTail);
};
PA_ALWAYS_INLINE bool ThreadCache::MaybePutInCache(uintptr_t slot_start,
size_t bucket_index,
size_t* slot_size) {
PA_REENTRANCY_GUARD(is_in_thread_cache_);
PA_INCREMENT_COUNTER(stats_.cache_fill_count);
if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
PA_INCREMENT_COUNTER(stats_.cache_fill_misses);
return false;
}
auto& bucket = buckets_[bucket_index];
PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
PutInBucket(bucket, slot_start);
cached_memory_ += bucket.slot_size;
PA_INCREMENT_COUNTER(stats_.cache_fill_hits);
// Relaxed ordering: we don't care about having an up-to-date or consistent
// value, we just want it not to change while we are using it, hence the
// relaxed ordering and the load into a local variable. Without the local
// variable, we would be gambling that the compiler does not issue multiple
// loads.
uint8_t limit = bucket.limit.load(std::memory_order_relaxed);
// Batched deallocation, amortizing lock acquisitions.
if (PA_UNLIKELY(bucket.count > limit)) {
ClearBucket(bucket, limit / 2);
}
if (PA_UNLIKELY(should_purge_.load(std::memory_order_relaxed))) {
PurgeInternal();
}
*slot_size = bucket.slot_size;
return true;
}
PA_ALWAYS_INLINE uintptr_t ThreadCache::GetFromCache(size_t bucket_index,
size_t* slot_size) {
#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
stats_.allocs_per_bucket_[bucket_index]++;
#endif
PA_REENTRANCY_GUARD(is_in_thread_cache_);
PA_INCREMENT_COUNTER(stats_.alloc_count);
// Only handle "small" allocations.
if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
PA_INCREMENT_COUNTER(stats_.alloc_miss_too_large);
PA_INCREMENT_COUNTER(stats_.alloc_misses);
return 0;
}
auto& bucket = buckets_[bucket_index];
if (PA_LIKELY(bucket.freelist_head)) {
PA_INCREMENT_COUNTER(stats_.alloc_hits);
} else {
PA_DCHECK(bucket.count == 0);
PA_INCREMENT_COUNTER(stats_.alloc_miss_empty);
PA_INCREMENT_COUNTER(stats_.alloc_misses);
FillBucket(bucket_index);
// Very unlikely; this means that the central allocator is out of memory. Let
// it deal with it (may return 0, may crash).
if (PA_UNLIKELY(!bucket.freelist_head)) {
return 0;
}
}
PA_DCHECK(bucket.count != 0);
internal::EncodedNextFreelistEntry* entry = bucket.freelist_head;
// TODO(lizeb): Consider removing once crbug.com/1382658 is fixed.
#if BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) && \
BUILDFLAG(HAS_64_BIT_POINTERS)
// The x86_64 architecture now supports 57 bits of address space, as of Ice
// Lake for Intel. However, Chrome OS systems do not ship with kernel support
// for it, only for 48 bits, so all canonical addresses have the upper 16 bits
// zeroed (17 in practice, since the upper half of the address space is
// reserved by the kernel).
constexpr uintptr_t kCanonicalPointerMask = (1ULL << 48) - 1;
PA_CHECK(!(reinterpret_cast<uintptr_t>(entry) & ~kCanonicalPointerMask));
#endif // BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) &&
// BUILDFLAG(HAS_64_BIT_POINTERS)
// Passes the bucket size to |GetNext()|, so that in case of freelist
// corruption, we know the bucket size that led to the crash, helping to narrow
// down the search for the culprit. |bucket| was touched just now, so this does
// not introduce another cache miss.
internal::EncodedNextFreelistEntry* next =
entry->GetNextForThreadCache<true>(bucket.slot_size);
PA_DCHECK(entry != next);
bucket.count--;
PA_DCHECK(bucket.count != 0 || !next);
bucket.freelist_head = next;
*slot_size = bucket.slot_size;
PA_DCHECK(cached_memory_ >= bucket.slot_size);
cached_memory_ -= bucket.slot_size;
return internal::SlotStartPtr2Addr(entry);
}
PA_ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket,
uintptr_t slot_start) {
#if PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) && \
BUILDFLAG(HAS_64_BIT_POINTERS)
// We see freelist corruption crashes happening in the wild. These are likely
// due to out-of-bounds accesses in the previous slot, or to a Use-After-Free
// somewhere in the code.
//
// The issue is that we detect the UaF far away from the place where it
// happens. As a consequence, we should try to make incorrect code crash as
// early as possible. Poisoning memory at free() time works for UaF, but it
// was seen in the past to incur a high performance cost.
//
// Here, only poison the current cacheline, which we are touching anyway.
// TODO(lizeb): Make sure this does not hurt performance.
// Everything below requires this alignment.
static_assert(internal::kAlignment == 16, "");
// The pointer is always 16-byte aligned, so its start address is always 0
// modulo 16. Its distance to the next cacheline is
// `64 - ((slot_start & 63) / 16) * 16` bytes.
static_assert(
internal::kPartitionCachelineSize == 64,
"The computation below assumes that cache lines are 64 bytes long.");
int distance_to_next_cacheline_in_16_bytes = 4 - ((slot_start >> 4) & 3);
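// Worked example for the line above: if slot_start ends in 0x30, then
// ((slot_start >> 4) & 3) == 3, so the distance is 4 - 3 == 1 unit of 16
// bytes, i.e. the next cacheline starts 16 bytes away (at ...0x40).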
int slot_size_remaining_in_16_bytes =
#if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
// When BRP is on in the "previous slot" mode, this slot may have a BRP
// ref-count of the next, potentially allocated slot. Make sure we don't
// overwrite it.
(bucket.slot_size - sizeof(PartitionRefCount)) / 16;
#else
bucket.slot_size / 16;
#endif // BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
slot_size_remaining_in_16_bytes = std::min(
slot_size_remaining_in_16_bytes, distance_to_next_cacheline_in_16_bytes);
static const uint32_t poison_16_bytes[4] = {0xbadbad00, 0xbadbad00,
0xbadbad00, 0xbadbad00};
// Give a hint to the compiler, in the hope that it'll vectorize the loop.
#if PA_HAS_BUILTIN(__builtin_assume_aligned)
void* slot_start_tagged = __builtin_assume_aligned(
internal::SlotStartAddr2Ptr(slot_start), internal::kAlignment);
#else
void* slot_start_tagged = internal::SlotStartAddr2Ptr(slot_start);
#endif
uint32_t* address_aligned = static_cast<uint32_t*>(slot_start_tagged);
for (int i = 0; i < slot_size_remaining_in_16_bytes; i++) {
// Clang will expand the memcpy to a 16-byte write (movups on x86).
memcpy(address_aligned, poison_16_bytes, sizeof(poison_16_bytes));
address_aligned += 4;
}
#endif // PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) &&
// BUILDFLAG(HAS_64_BIT_POINTERS)
auto* entry =
internal::EncodedNextFreelistEntry::EmplaceAndInitForThreadCache(
slot_start, bucket.freelist_head);
bucket.freelist_head = entry;
bucket.count++;
}
PA_ALWAYS_INLINE void ThreadCache::RecordAllocation(size_t size) {
thread_alloc_stats_.alloc_count++;
thread_alloc_stats_.alloc_total_size += size;
}
PA_ALWAYS_INLINE void ThreadCache::RecordDeallocation(size_t size) {
thread_alloc_stats_.dealloc_count++;
thread_alloc_stats_.dealloc_total_size += size;
}
} // namespace partition_alloc
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_