
Commit 173a1d7

ckennelly authored and copybara-github committed
Improve efficacy of lazy per-CPU slab initialization.
Accessing portions of the slab, even only for reading, may cause the kernel to back the slab with physical memory. Before accessing a per-CPU slab, use the CPUCache's out-of-band metadata (populated) to confirm whether we have already populated that slab.

PiperOrigin-RevId: 318511754
Change-Id: I5a5477f1f449a00be0209d7809401f4db66bf8ce
1 parent 6b73ba7 · commit 173a1d7
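The commit's approach is a check-before-touch pattern: the populated flag lives in metadata that is always resident, and stats readers return early for CPUs whose slabs were never populated, so a read-only pass over all CPUs no longer forces the kernel to back idle slabs. Below is a minimal, self-contained sketch of that pattern; the LazyPerCpuStats class and its MarkPopulated helper are hypothetical stand-ins for illustration, not tcmalloc's actual CPUCache API (only the HasPopulated and UsedBytes names mirror the diff).

// Sketch only: illustrates checking an out-of-band "populated" flag before
// reading per-CPU state. Names and layout are hypothetical, not tcmalloc's.
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

class LazyPerCpuStats {
 public:
  explicit LazyPerCpuStats(int num_cpus)
      : populated_(num_cpus), used_bytes_(num_cpus, 0) {}

  // Called on a CPU's first allocation: record usage, then publish the flag.
  void MarkPopulated(int cpu, uint64_t bytes) {
    used_bytes_[cpu] = bytes;
    populated_[cpu].store(true, std::memory_order_release);
  }

  bool HasPopulated(int cpu) const {
    return populated_[cpu].load(std::memory_order_acquire);
  }

  // Stats readers consult the out-of-band flag first; an unpopulated CPU
  // reports 0 without ever touching its (lazily backed) per-CPU region.
  uint64_t UsedBytes(int cpu) const {
    if (!HasPopulated(cpu)) {
      return 0;
    }
    return used_bytes_[cpu];
  }

 private:
  std::vector<std::atomic<bool>> populated_;  // out-of-band, always resident
  std::vector<uint64_t> used_bytes_;          // stand-in for the per-CPU slab
};

int main() {
  constexpr int kNumCpus = 4;
  LazyPerCpuStats stats(kNumCpus);
  stats.MarkPopulated(/*cpu=*/0, /*bytes=*/4096);  // only CPU 0 ever allocates

  for (int cpu = 0; cpu < kNumCpus; ++cpu) {
    std::printf("cpu %d: populated=%d used_bytes=%llu\n", cpu,
                stats.HasPopulated(cpu) ? 1 : 0,
                static_cast<unsigned long long>(stats.UsedBytes(cpu)));
  }
  return 0;
}

In the real change below, the same guard appears as the HasPopulated() checks added to CPUCache::UsedBytes() and CPUCache::TotalObjectsOfClass(), and the test verifies that reading stats for every CPU leaves resident_size unchanged.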

2 files changed: +46 -3 lines changed

tcmalloc/cpu_cache.cc

Lines changed: 8 additions & 1 deletion
@@ -428,6 +428,10 @@ int CPUCache::Overflow(void *ptr, size_t cl, int cpu) {
 
 uint64_t CPUCache::UsedBytes(int target_cpu) const {
   ASSERT(target_cpu >= 0);
+  if (!HasPopulated(target_cpu)) {
+    return 0;
+  }
+
   uint64_t total = 0;
   for (int cl = 1; cl < kNumClasses; cl++) {
     int size = Static::sizemap()->class_to_size(cl);
@@ -458,7 +462,10 @@ uint64_t CPUCache::TotalObjectsOfClass(size_t cl) const {
   ASSERT(cl < kNumClasses);
   uint64_t total_objects = 0;
   if (cl > 0) {
-    for (int cpu = 0; cpu < absl::base_internal::NumCPUs(); cpu++) {
+    for (int cpu = 0, n = absl::base_internal::NumCPUs(); cpu < n; cpu++) {
+      if (!HasPopulated(cpu)) {
+        continue;
+      }
       total_objects += freelist_.Length(cpu, cl);
     }
   }

tcmalloc/cpu_cache_test.cc

Lines changed: 38 additions & 2 deletions
@@ -58,7 +58,9 @@ TEST(CpuCacheTest, Metadata) {
 
   EXPECT_EQ(0, count_cores());
 
+  int allowed_cpu_id;
   const size_t kSizeClass = 3;
+  const size_t num_to_move = Static::sizemap()->num_objects_to_move(kSizeClass);
   void* ptr;
   {
     // Restrict this thread to a single core while allocating and processing the
@@ -67,8 +69,8 @@
     // TODO(b/151313823): Without this restriction, we may access--for reading
     // only--other slabs if we end up being migrated. These may cause huge
     // pages to be faulted for those cores, leading to test flakiness.
-    tcmalloc_internal::ScopedAffinityMask mask(
-        tcmalloc_internal::AllowedCpus()[0]);
+    allowed_cpu_id = tcmalloc_internal::AllowedCpus()[0];
+    tcmalloc_internal::ScopedAffinityMask mask(allowed_cpu_id);
 
     ptr = cache.Allocate<OOMHandler>(kSizeClass);
 
@@ -112,6 +114,40 @@
         ASSUME(false);
         break;
     };
+
+    // Read stats from the CPU caches. This should not impact resident_size.
+    const size_t max_cpu_cache_size = Parameters::max_per_cpu_cache_size();
+    size_t total_used_bytes = 0;
+    for (int cpu = 0; cpu < num_cpus; ++cpu) {
+      size_t used_bytes = cache.UsedBytes(cpu);
+      total_used_bytes += used_bytes;
+
+      if (cpu == allowed_cpu_id) {
+        EXPECT_GT(used_bytes, 0);
+        EXPECT_TRUE(cache.HasPopulated(cpu));
+      } else {
+        EXPECT_EQ(used_bytes, 0);
+        EXPECT_FALSE(cache.HasPopulated(cpu));
+      }
+
+      EXPECT_LE(cache.Unallocated(cpu), max_cpu_cache_size);
+    }
+
+    for (int cl = 0; cl < kNumClasses; ++cl) {
+      // This is sensitive to the current growth policies of CPUCache. It may
+      // require updating from time-to-time.
+      EXPECT_EQ(cache.TotalObjectsOfClass(cl),
+                (cl == kSizeClass ? num_to_move - 1 : 0))
+          << cl;
+    }
+    EXPECT_EQ(cache.TotalUsedBytes(), total_used_bytes);
+
+    PerCPUMetadataState post_stats = cache.MetadataMemoryUsage();
+    // Confirm stats are within expected bounds.
+    EXPECT_GT(post_stats.resident_size, 0);
+    EXPECT_LE(post_stats.resident_size, upper_bound) << count_cores();
+    // Confirm stats are unchanged.
+    EXPECT_EQ(r.resident_size, post_stats.resident_size);
   } else {
     EXPECT_EQ(r.resident_size, r.virtual_size);
   }