// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_
#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc-inl.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/gtest_prod_util.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/thread_annotations.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_base/time/time.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_buildflags.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_config.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_alloc_forward.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_bucket_lookup.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_freelist_entry.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_lock.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_stats.h"
#include "base/allocator/partition_allocator/src/partition_alloc/partition_tls.h"
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_64) && BUILDFLAG(HAS_64_BIT_POINTERS)
#include <algorithm>
#endif
namespace partition_alloc {
class ThreadCache;
namespace tools {
// This is used from ThreadCacheInspector, which runs in a different process. It
// scans the process memory looking for the two needles, to locate the thread
// cache registry instance.
//
// These two values were chosen randomly, and in particular neither is a valid
// pointer on most 64-bit architectures.
#if BUILDFLAG(HAS_64_BIT_POINTERS)
constexpr uintptr_t kNeedle1 = 0xe69e32f3ad9ea63;
constexpr uintptr_t kNeedle2 = 0x9615ee1c5eb14caf;
#else
constexpr uintptr_t kNeedle1 = 0xe69e32f3;
constexpr uintptr_t kNeedle2 = 0x9615ee1c;
#endif // BUILDFLAG(HAS_64_BIT_POINTERS)
// This array contains, in order:
// - kNeedle1
// - &ThreadCacheRegistry::Instance()
// - kNeedle2
//
// It is referenced in the thread cache constructor to make sure it is not
// removed by the compiler. It is also not const to make sure it ends up in
// .data.
constexpr size_t kThreadCacheNeedleArraySize = 4;
extern uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize];
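// Illustrative sketch only (not part of this header): an out-of-process
// inspector could locate the registry by scanning readable memory for the two
// needles and reading the pointer stored between them. The helper name and the
// word-based scan below are assumptions for illustration.
//
//   uintptr_t FindRegistryAddress(const uintptr_t* words, size_t count) {
//     for (size_t i = 0; i + 2 < count; i++) {
//       if (words[i] == kNeedle1 && words[i + 2] == kNeedle2)
//         return words[i + 1];  // &ThreadCacheRegistry::Instance().
//     }
//     return 0;
//   }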
class HeapDumper;
class ThreadCacheInspector;
} // namespace tools
namespace internal {
extern PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
extern PA_COMPONENT_EXPORT(
PARTITION_ALLOC) thread_local ThreadCache* g_thread_cache;
#endif
} // namespace internal
struct ThreadCacheLimits {
// When trying to conserve memory, set the thread cache limit to this.
static constexpr size_t kDefaultSizeThreshold = 512;
// 32kiB is chosen here because, from local experiments, "zone" allocation in
// V8 is performance-sensitive, and zones can (and do) grow up to 32kiB for
// each individual allocation.
static constexpr size_t kLargeSizeThreshold = 1 << 15;
static_assert(kLargeSizeThreshold <= std::numeric_limits<uint16_t>::max(),
"");
};
constexpr internal::base::TimeDelta kMinPurgeInterval =
internal::base::Seconds(1);
constexpr internal::base::TimeDelta kMaxPurgeInterval =
internal::base::Minutes(1);
constexpr internal::base::TimeDelta kDefaultPurgeInterval =
2 * kMinPurgeInterval;
constexpr size_t kMinCachedMemoryForPurgingBytes = 500 * 1024;
// Global registry of all ThreadCache instances.
//
// This class cannot allocate in the (Un)registerThreadCache() functions, as
// they are called from the ThreadCache constructor, which runs from within the
// allocator. However, the other members can allocate.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCacheRegistry {
public:
static ThreadCacheRegistry& Instance();
// Do not instantiate.
//
// Several things are surprising here:
// - The constructor is public even though this is intended to be a singleton:
// we cannot use a "static local" variable in |Instance()| as this is
// reached too early during CRT initialization on Windows, meaning that
// static local variables don't work (as they call into the uninitialized
// runtime). To sidestep that, we use a regular global variable in the .cc,
// which is fine as this object's constructor is constexpr.
// - Marked inline so that the chromium style plugin doesn't complain that a
// "complex constructor" has an inline body. This warning is disabled when
// the constructor is explicitly marked "inline". Note that this is a false
// positive of the plugin, since constexpr implies inline.
inline constexpr ThreadCacheRegistry();
void RegisterThreadCache(ThreadCache* cache);
void UnregisterThreadCache(ThreadCache* cache);
// Prints statistics for all thread caches, or this thread's only.
void DumpStats(bool my_thread_only, ThreadCacheStats* stats);
// Purges this thread's cache, and asks the other ones to trigger Purge() at
// a later point (during a deallocation).
void PurgeAll();
// Runs `PurgeAll` and updates the next interval which
// `GetPeriodicPurgeNextIntervalInMicroseconds` returns.
//
// Note that it is the caller's responsibility to invoke this member function
// periodically with an appropriate interval. This function does not schedule
// any task nor timer.
void RunPeriodicPurge();
// Returns the appropriate interval at which to next invoke `RunPeriodicPurge`.
int64_t GetPeriodicPurgeNextIntervalInMicroseconds() const;
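// Illustrative usage sketch (an assumption, not an API defined here): an
// embedder is expected to drive purging with a repeating task, re-arming it
// with the interval returned above. `PostDelayedTask` stands for a
// hypothetical scheduler.
//
//   void PeriodicPurgeTask() {
//     auto& registry = ThreadCacheRegistry::Instance();
//     registry.RunPeriodicPurge();
//     int64_t delay_us =
//         registry.GetPeriodicPurgeNextIntervalInMicroseconds();
//     PostDelayedTask(&PeriodicPurgeTask, delay_us);  // Hypothetical.
//   }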
// Controls the thread cache size, by setting the multiplier to a value above
// or below |ThreadCache::kDefaultMultiplier|.
void SetThreadCacheMultiplier(float multiplier);
void SetLargestActiveBucketIndex(uint8_t largest_active_bucket_index);
// Controls the thread cache purging configuration.
void SetPurgingConfiguration(
const internal::base::TimeDelta min_purge_interval,
const internal::base::TimeDelta max_purge_interval,
const internal::base::TimeDelta default_purge_interval,
size_t min_cached_memory_for_purging_bytes);
internal::base::TimeDelta min_purge_interval() const {
return min_purge_interval_;
}
internal::base::TimeDelta max_purge_interval() const {
return max_purge_interval_;
}
internal::base::TimeDelta default_purge_interval() const {
return default_purge_interval_;
}
size_t min_cached_memory_for_purging_bytes() const {
return min_cached_memory_for_purging_bytes_;
}
bool is_purging_configured() const { return is_purging_configured_; }
static internal::Lock& GetLock() { return Instance().lock_; }
// Purges all thread caches *now*. This is completely thread-unsafe, and
// should only be called in a post-fork() handler.
void ForcePurgeAllThreadAfterForkUnsafe();
void ResetForTesting();
private:
friend class tools::ThreadCacheInspector;
friend class tools::HeapDumper;
// Not using base::Lock as the object's constructor must be constexpr.
internal::Lock lock_;
ThreadCache* list_head_ PA_GUARDED_BY(GetLock()) = nullptr;
bool periodic_purge_is_initialized_ = false;
internal::base::TimeDelta min_purge_interval_;
internal::base::TimeDelta max_purge_interval_;
internal::base::TimeDelta default_purge_interval_;
size_t min_cached_memory_for_purging_bytes_ = 0u;
internal::base::TimeDelta periodic_purge_next_interval_;
bool is_purging_configured_ = false;
uint8_t largest_active_bucket_index_ = internal::BucketIndexLookup::GetIndex(
ThreadCacheLimits::kDefaultSizeThreshold);
};
constexpr ThreadCacheRegistry::ThreadCacheRegistry() = default;
#if PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
#define PA_INCREMENT_COUNTER(counter) ++counter
#else
#define PA_INCREMENT_COUNTER(counter) \
do { \
} while (0)
#endif // PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
#if BUILDFLAG(PA_DCHECK_IS_ON)
namespace internal {
class ReentrancyGuard {
public:
explicit ReentrancyGuard(bool& flag) : flag_(flag) {
PA_CHECK(!flag_);
flag_ = true;
}
~ReentrancyGuard() { flag_ = false; }
private:
bool& flag_;
};
} // namespace internal
#define PA_REENTRANCY_GUARD(x) \
internal::ReentrancyGuard guard { \
x \
}
#else // BUILDFLAG(PA_DCHECK_IS_ON)
#define PA_REENTRANCY_GUARD(x) \
do { \
} while (0)
#endif // BUILDFLAG(PA_DCHECK_IS_ON)
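// Illustrative sketch of what the guard catches, assuming a DCHECK-enabled
// build: re-entering a guarded scope on the same thread trips the PA_CHECK in
// the ReentrancyGuard constructor, e.g. if a thread cache operation ends up
// allocating and re-enters itself. The caller below is hypothetical.
//
//   void GuardedOperation(bool& flag) {
//     PA_REENTRANCY_GUARD(flag);   // Checks |flag| was false, then sets it.
//     GuardedOperation(flag);      // A nested guard would CHECK here.
//   }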
// Per-thread cache. *Not* thread-safe; must only be accessed from a single
// thread.
//
// In practice, this is easily enforced as long as only |instance| is
// manipulated, as it is a thread_local member. As such, any
// |ThreadCache::instance->*()| call will necessarily be done from a single
// thread.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCache {
public:
// Initializes the thread cache for |root|. May allocate, so should be called
// with the thread cache disabled on the partition side, and without the
// partition lock held.
//
// May only be called by a single PartitionRoot.
static void Init(PartitionRoot* root);
static void DeleteForTesting(ThreadCache* tcache);
// Deletes existing thread cache and creates a new one for |root|.
static void SwapForTesting(PartitionRoot* root);
// Removes the tombstone marker that would be returned by Get() otherwise.
static void RemoveTombstoneForTesting();
// Can be called several times, must be called before any ThreadCache
// interactions.
static void EnsureThreadSpecificDataInitialized();
static ThreadCache* Get() {
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
return internal::g_thread_cache;
#else
// This region isn't MTE-tagged.
return reinterpret_cast<ThreadCache*>(
internal::PartitionTlsGet(internal::g_thread_cache_key));
#endif
}
static bool IsValid(ThreadCache* tcache) {
// Do not MTE-untag, as it'd mess up the sentinel value.
return reinterpret_cast<uintptr_t>(tcache) & kTombstoneMask;
}
static bool IsTombstone(ThreadCache* tcache) {
// Do not MTE-untag, as it'd mess up the sentinel value.
return reinterpret_cast<uintptr_t>(tcache) == kTombstone;
}
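// Typical caller pattern, as an illustrative sketch (the real call sites are
// in the allocator fast paths): fetch the per-thread instance and check
// validity, which rejects both nullptr and the tombstone in a single test.
//
//   ThreadCache* tcache = ThreadCache::Get();
//   if (PA_LIKELY(ThreadCache::IsValid(tcache))) {
//     // Safe to use |tcache| from this thread.
//   } else if (ThreadCache::IsTombstone(tcache)) {
//     // The thread is being terminated; do not create a new cache.
//   }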
// Create a new ThreadCache associated with |root|.
// Must be called without the partition locked, as this may allocate.
static ThreadCache* Create(PartitionRoot* root);
~ThreadCache();
// Force placement new.
void* operator new(size_t) = delete;
void* operator new(size_t, void* buffer) { return buffer; }
void operator delete(void* ptr) = delete;
ThreadCache(const ThreadCache&) = delete;
ThreadCache(const ThreadCache&&) = delete;
ThreadCache& operator=(const ThreadCache&) = delete;
// Tries to put a slot at |slot_start| into the cache.
// The slot comes from the bucket at index |bucket_index| from the partition
// this cache is for.
//
// Returns true if the slot was put in the cache, and false otherwise. This
// can happen either because the cache is full or the allocation was too
// large.
PA_ALWAYS_INLINE bool MaybePutInCache(uintptr_t slot_start,
size_t bucket_index,
size_t* slot_size);
// Tries to allocate a memory slot from the cache.
// Returns 0 on failure.
//
// Has the same behavior as RawAlloc(), that is: no cookie nor ref-count
// handling. Sets |slot_size| to the allocated size upon success.
PA_ALWAYS_INLINE uintptr_t GetFromCache(size_t bucket_index,
size_t* slot_size);
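// Hedged sketch of how an allocation/deallocation fast path might consult the
// two functions above. The central-allocator helpers named here are
// hypothetical; the real call sites live in PartitionRoot.
//
//   size_t slot_size;
//   uintptr_t slot_start = tcache->GetFromCache(bucket_index, &slot_size);
//   if (!slot_start)
//     slot_start = CentralAlloc(bucket_index, &slot_size);  // Hypothetical.
//   ...
//   if (!tcache->MaybePutInCache(slot_start, bucket_index, &slot_size))
//     CentralFree(slot_start);  // Hypothetical slow path.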
// Asks this cache to trigger |Purge()| at a later point. Can be called from
// any thread.
void SetShouldPurge();
// Empties the cache.
// The Partition lock must *not* be held when calling this.
// Must be called from the thread this cache is for.
void Purge();
// |TryPurge| is the same as |Purge|, except that |TryPurge| will
// not crash if the thread cache is inconsistent. Normally inconsistency
// is a sign of a bug somewhere, so |Purge| should be preferred in most cases.
void TryPurge();
// Amount of cached memory for this thread's cache, in bytes.
size_t CachedMemory() const;
void AccumulateStats(ThreadCacheStats* stats) const;
// Purge the thread cache of the current thread, if one exists.
static void PurgeCurrentThread();
const ThreadAllocStats& thread_alloc_stats() const {
return thread_alloc_stats_;
}
size_t bucket_count_for_testing(size_t index) const {
return buckets_[index].count;
}
internal::base::PlatformThreadId thread_id() const { return thread_id_; }
// Sets the maximum size of allocations that may be cached by the thread
// cache. This applies to all threads. However, the maximum size is bounded by
// |kLargeSizeThreshold|.
static void SetLargestCachedSize(size_t size);
// Cumulative stats about *all* allocations made on the `root_` partition on
// this thread, that is, not only the allocations serviced by the thread cache,
// but all allocations, including large and direct-mapped ones. This should in
// theory be split into a separate PerThread data structure, but the thread
// cache is the only per-thread data we have as of now.
//
// TODO(lizeb): Investigate adding a proper per-thread data structure.
PA_ALWAYS_INLINE void RecordAllocation(size_t size);
PA_ALWAYS_INLINE void RecordDeallocation(size_t size);
void ResetPerThreadAllocationStatsForTesting();
// Fill 1 / kBatchFillRatio * bucket.limit slots at a time.
static constexpr uint16_t kBatchFillRatio = 8;
// Limit for the smallest bucket will be kDefaultMultiplier *
// kSmallBucketBaseCount by default.
static constexpr float kDefaultMultiplier = 2.;
static constexpr uint8_t kSmallBucketBaseCount = 64;
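// Worked example from the constants above: with the default multiplier, the
// smallest bucket may cache up to kDefaultMultiplier * kSmallBucketBaseCount =
// 2 * 64 = 128 slots; limits for the other buckets are derived in
// SetGlobalLimits().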
static constexpr size_t kDefaultSizeThreshold =
ThreadCacheLimits::kDefaultSizeThreshold;
static constexpr size_t kLargeSizeThreshold =
ThreadCacheLimits::kLargeSizeThreshold;
const ThreadCache* prev_for_testing() const
PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
return prev_;
}
const ThreadCache* next_for_testing() const
PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
return next_;
}
private:
friend class tools::HeapDumper;
friend class tools::ThreadCacheInspector;
struct Bucket {
internal::EncodedNextFreelistEntry* freelist_head = nullptr;
// Want to keep sizeof(Bucket) small, using small types.
uint8_t count = 0;
std::atomic<uint8_t> limit{}; // Can be changed from another thread.
uint16_t slot_size = 0;
Bucket();
};
static_assert(sizeof(Bucket) <= 2 * sizeof(void*), "Keep Bucket small.");
explicit ThreadCache(PartitionRoot* root);
static void Delete(void* thread_cache_ptr);
void PurgeInternal();
template <bool crash_on_corruption>
void PurgeInternalHelper();
// Fills a bucket from the central allocator.
void FillBucket(size_t bucket_index);
// Empties the |bucket| until there are at most |limit| objects in it.
template <bool crash_on_corruption>
void ClearBucketHelper(Bucket& bucket, size_t limit);
void ClearBucket(Bucket& bucket, size_t limit);
PA_ALWAYS_INLINE void PutInBucket(Bucket& bucket, uintptr_t slot_start);
void ResetForTesting();
// Releases the entire freelist starting at |head| to the root.
template <bool crash_on_corruption>
void FreeAfter(internal::EncodedNextFreelistEntry* head, size_t slot_size);
static void SetGlobalLimits(PartitionRoot* root, float multiplier);
static constexpr uint16_t kBucketCount =
internal::BucketIndexLookup::GetIndex(ThreadCache::kLargeSizeThreshold) +
1;
static_assert(
kBucketCount < internal::kNumBuckets,
"Cannot have more cached buckets than what the allocator supports");
// On some architectures, ThreadCache::Get() can be called and return
// something after the thread cache has been destroyed. In this case, the TLS
// slot is set to this value, to signal that the thread is being terminated and
// that the thread cache should not be used.
//
// This happens in particular on Windows, during program termination.
//
// We choose 0x1 as the value because it is an invalid pointer: it is not
// aligned, and it is too low. Also, checking !(ptr & kTombstoneMask) checks
// for nullptr and kTombstone at the same time.
static constexpr uintptr_t kTombstone = 0x1;
static constexpr uintptr_t kTombstoneMask = ~kTombstone;
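// Worked example of the combined check in IsValid(): with kTombstone == 0x1
// and kTombstoneMask == ~0x1, both nullptr (0x0 & kTombstoneMask == 0) and the
// tombstone (0x1 & kTombstoneMask == 0) fail the check, while any real,
// aligned ThreadCache pointer has non-zero masked bits and passes.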
static uint8_t global_limits_[kBucketCount];
// Index of the largest active bucket. Not all processes/platforms will use
// all buckets, as using larger buckets increases the memory footprint.
//
// TODO(lizeb): Investigate making this per-thread rather than static, to
// improve locality, and open the door to per-thread settings.
static uint16_t largest_active_bucket_index_;
// These are at the beginning as they're accessed for each allocation.
uint32_t cached_memory_ = 0;
std::atomic<bool> should_purge_;
ThreadCacheStats stats_;
ThreadAllocStats thread_alloc_stats_;
// The buckets_ array below is quite big overall, even though each Bucket is
// only 2 pointers.
Bucket buckets_[kBucketCount];
// Cold data below.
PartitionRoot* const root_;
const internal::base::PlatformThreadId thread_id_;
#if BUILDFLAG(PA_DCHECK_IS_ON)
bool is_in_thread_cache_ = false;
#endif
// Intrusive list since ThreadCacheRegistry::RegisterThreadCache() cannot
// allocate.
ThreadCache* next_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
ThreadCache* prev_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
friend class ThreadCacheRegistry;
friend class PartitionAllocThreadCacheTest;
friend class tools::ThreadCacheInspector;
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, Simple);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleObjectsCachedPerBucket);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
LargeAllocationsAreNotCached);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleThreadCaches);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, RecordStats);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
ThreadCacheRegistry);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
MultipleThreadCachesAccounting);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucket);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucketClamping);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicCountPerBucketMultipleThreads);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicSizeThreshold);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest,
DynamicSizeThresholdPurge);
PA_FRIEND_TEST_ALL_PREFIXES(PartitionAllocThreadCacheTest, ClearFromTail);
};
PA_ALWAYS_INLINE bool ThreadCache::MaybePutInCache(uintptr_t slot_start,
size_t bucket_index,
size_t* slot_size) {
PA_REENTRANCY_GUARD(is_in_thread_cache_);
PA_INCREMENT_COUNTER(stats_.cache_fill_count);
if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
PA_INCREMENT_COUNTER(stats_.cache_fill_misses);
return false;
}
auto& bucket = buckets_[bucket_index];
PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
PutInBucket(bucket, slot_start);
cached_memory_ += bucket.slot_size;
PA_INCREMENT_COUNTER(stats_.cache_fill_hits);
// Relaxed ordering: we don't care about having an up-to-date or consistent
// value, we just want it not to change while we are using it, hence the
// relaxed ordering and the load into a local variable. Without the local
// variable, we would be gambling that the compiler does not issue multiple
// loads.
uint8_t limit = bucket.limit.load(std::memory_order_relaxed);
// Batched deallocation, amortizing lock acquisitions.
if (PA_UNLIKELY(bucket.count > limit)) {
ClearBucket(bucket, limit / 2);
}
if (PA_UNLIKELY(should_purge_.load(std::memory_order_relaxed))) {
PurgeInternal();
}
*slot_size = bucket.slot_size;
return true;
}
PA_ALWAYS_INLINE uintptr_t ThreadCache::GetFromCache(size_t bucket_index,
size_t* slot_size) {
#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
stats_.allocs_per_bucket_[bucket_index]++;
#endif
PA_REENTRANCY_GUARD(is_in_thread_cache_);
PA_INCREMENT_COUNTER(stats_.alloc_count);
// Only handle "small" allocations.
if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
PA_INCREMENT_COUNTER(stats_.alloc_miss_too_large);
PA_INCREMENT_COUNTER(stats_.alloc_misses);
return 0;
}
auto& bucket = buckets_[bucket_index];
if (PA_LIKELY(bucket.freelist_head)) {
PA_INCREMENT_COUNTER(stats_.alloc_hits);
} else {
PA_DCHECK(bucket.count == 0);
PA_INCREMENT_COUNTER(stats_.alloc_miss_empty);
PA_INCREMENT_COUNTER(stats_.alloc_misses);
FillBucket(bucket_index);
// Very unlikely; this means that the central allocator is out of memory. Let
// it deal with it (may return 0, may crash).
if (PA_UNLIKELY(!bucket.freelist_head)) {
return 0;
}
}
PA_DCHECK(bucket.count != 0);
internal::EncodedNextFreelistEntry* entry = bucket.freelist_head;
// TODO(lizeb): Consider removing once crbug.com/1382658 is fixed.
#if BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) && \
BUILDFLAG(HAS_64_BIT_POINTERS)
// The x86_64 architecture now supports 57 bits of address space, as of Ice
// Lake for Intel. However, Chrome OS systems do not ship with kernel support
// for it, only for 48 bits, so all canonical addresses have the upper 16 bits
// zeroed (17 in practice, since the upper half of the address space is
// reserved by the kernel).
constexpr uintptr_t kCanonicalPointerMask = (1ULL << 48) - 1;
PA_CHECK(!(reinterpret_cast<uintptr_t>(entry) & ~kCanonicalPointerMask));
#endif // BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) &&
// BUILDFLAG(HAS_64_BIT_POINTERS)
// Passes the bucket size to |GetNext()|, so that in case of freelist
// corruption, we know the bucket size that led to the crash, helping to narrow
// down the search for the culprit. |bucket| was touched just now, so this does
// not introduce another cache miss.
internal::EncodedNextFreelistEntry* next =
entry->GetNextForThreadCache<true>(bucket.slot_size);
PA_DCHECK(entry != next);
bucket.count--;
PA_DCHECK(bucket.count != 0 || !next);
bucket.freelist_head = next;
*slot_size = bucket.slot_size;
PA_DCHECK(cached_memory_ >= bucket.slot_size);
cached_memory_ -= bucket.slot_size;
return internal::SlotStartPtr2Addr(entry);
}
PA_ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket,
uintptr_t slot_start) {
#if PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) && \
BUILDFLAG(HAS_64_BIT_POINTERS)
// We see freelist corruption crashes happening in the wild. These are likely
// due to out-of-bounds accesses in the previous slot, or to a Use-After-Free
// somewhere in the code.
//
// The issue is that we detect the UaF far away from the place where it
// happens. As a consequence, we should try to make incorrect code crash as
// early as possible. Poisoning memory at free() time works for UaF, but it
// was seen in the past to incur a high performance cost.
//
// Here, only poison the current cacheline, which we are touching anyway.
// TODO(lizeb): Make sure this does not hurt performance.
// Everything below requires this alignment.
static_assert(internal::kAlignment == 16, "");
// The pointer is always 16-byte aligned, so its start address is always 0
// modulo 16. Its distance to the next cacheline is
// `64 - ((slot_start & 63) / 16) * 16` bytes.
static_assert(
internal::kPartitionCachelineSize == 64,
"The computation below assumes that cache lines are 64 bytes long.");
int distance_to_next_cacheline_in_16_bytes = 4 - ((slot_start >> 4) & 3);
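// Worked example for the line above: if slot_start ends in 0x30, then
// ((slot_start >> 4) & 3) == 3, so the distance is 4 - 3 == 1 unit of 16
// bytes, i.e. the next cacheline starts 16 bytes away (at ...0x40).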
int slot_size_remaining_in_16_bytes =
#if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
// When BRP is on in the "previous slot" mode, this slot may have a BRP
// ref-count of the next, potentially allocated slot. Make sure we don't
// overwrite it.
(bucket.slot_size - sizeof(PartitionRefCount)) / 16;
#else
bucket.slot_size / 16;
#endif // BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
slot_size_remaining_in_16_bytes = std::min(
slot_size_remaining_in_16_bytes, distance_to_next_cacheline_in_16_bytes);
static const uint32_t poison_16_bytes[4] = {0xbadbad00, 0xbadbad00,
0xbadbad00, 0xbadbad00};
// Give a hint to the compiler, in the hope that it'll vectorize the loop.
#if PA_HAS_BUILTIN(__builtin_assume_aligned)
void* slot_start_tagged = __builtin_assume_aligned(
internal::SlotStartAddr2Ptr(slot_start), internal::kAlignment);
#else
void* slot_start_tagged = internal::SlotStartAddr2Ptr(slot_start);
#endif
uint32_t* address_aligned = static_cast<uint32_t*>(slot_start_tagged);
for (int i = 0; i < slot_size_remaining_in_16_bytes; i++) {
// Clang will expand the memcpy to a 16-byte write (movups on x86).
memcpy(address_aligned, poison_16_bytes, sizeof(poison_16_bytes));
address_aligned += 4;
}
#endif // PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) &&
// BUILDFLAG(HAS_64_BIT_POINTERS)
auto* entry =
internal::EncodedNextFreelistEntry::EmplaceAndInitForThreadCache(
slot_start, bucket.freelist_head);
bucket.freelist_head = entry;
bucket.count++;
}
PA_ALWAYS_INLINE void ThreadCache::RecordAllocation(size_t size) {
thread_alloc_stats_.alloc_count++;
thread_alloc_stats_.alloc_total_size += size;
}
PA_ALWAYS_INLINE void ThreadCache::RecordDeallocation(size_t size) {
thread_alloc_stats_.dealloc_count++;
thread_alloc_stats_.dealloc_total_size += size;
}
} // namespace partition_alloc
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_