@@ -1164,50 +1164,58 @@ def apple_silicon_cache_size(cache_level: int) -> int:
1164
1164
return size .value
1165
1165
1166
1166
1167
def get_cache_info(cache_level: int) -> tuple:
    """Return the size and instance count of a CPU cache as reported by lscpu.

    Runs ``lscpu --json`` and looks for the entry whose field is
    ``"L<cache_level> cache:"``.  The entry data is expected to look like
    ``"<number> <unit> (<count> ...)"``, for example ``"32 MiB (1 instance)"``.

    Parameters
    ----------
    cache_level : int
        Cache level to look up.  A value of 0 selects the ``"L1d cache:"``
        entry.

    Returns
    -------
    tuple
        ``(size, instances)``: the size converted to bytes, and the leading
        integer found inside the parentheses.

    Raises
    ------
    FileNotFoundError
        If the ``lscpu`` executable cannot be found.
    ValueError
        If the output is not valid JSON, the entry is missing or malformed,
        or the size unit is not one of KiB, MiB or GiB.
    """
    result = subprocess.run(["lscpu", "--json"], capture_output=True, text=True)
    # json.JSONDecodeError is a ValueError subclass, so empty or garbage output
    # surfaces as the ValueError documented above.
    lscpu_info = json.loads(result.stdout)

    # Level 0 is looked up under the "L1d" label; keep the int parameter
    # untouched and build the label separately.
    label = "1d" if cache_level == 0 else cache_level
    wanted_field = f"L{label} cache:"
    unit_multipliers = {"KiB": 2**10, "MiB": 2**20, "GiB": 2**30}

    for entry in lscpu_info.get("lscpu", []):
        if entry.get("field") != wanted_field:
            continue
        data = entry.get("data") or ""
        size_str, separator, instances_str = data.partition(" (")
        size_tokens = size_str.split()
        instance_tokens = instances_str.split()
        if not separator or len(size_tokens) != 2 or not instance_tokens:
            raise ValueError(f"Unexpected format for {wanted_field} {data!r}")
        number, units = size_tokens
        if units not in unit_multipliers:
            raise ValueError("Unrecognized unit when guessing cache units")
        # Sizes may be fractional (e.g. "1.5 MiB"), so go through float
        # before truncating to a whole number of bytes.
        size = int(float(number) * unit_multipliers[units])
        instances = int(instance_tokens[0])
        return size, instances

    raise ValueError(f"L{label} cache not found in lscpu output")
1178
1189
1179
1190
1180
1191
def linux_cache_size(cache_level: int, default_size: int) -> int:
    """Get the data cache_level size in bytes for Linux.

    The size is first read from the sysfs file
    ``/sys/devices/system/cpu/cpu0/cache/index<cache_level>/size``.  If that
    file does not exist, ``get_cache_info`` (which queries lscpu) is used
    instead, and the reported size is divided by the reported number of
    instances.

    Parameters
    ----------
    cache_level : int
        Index used both for the sysfs ``index<cache_level>`` directory and
        for the lscpu lookup.
    default_size : int
        Value returned when the cache size cannot be determined.

    Returns
    -------
    int
        The detected cache size in bytes, or ``default_size``.
    """
    suffix_multipliers = {"K": 2**10, "M": 2**20, "G": 2**30}

    try:
        # Try to read the cache size from sysfs
        with open(f"/sys/devices/system/cpu/cpu0/cache/index{cache_level}/size") as f:
            text = f.read().strip()
    except FileNotFoundError:
        # No sysfs entry for this index; fall through to lscpu below.
        pass
    else:
        # The value is a number followed by a one-letter unit, e.g. "32K".
        multiplier = suffix_multipliers.get(text[-1:])
        if multiplier is None:
            return default_size
        try:
            return int(text[:-1]) * multiplier
        except ValueError:
            # Unexpected content: stay best-effort and keep the default.
            return default_size

    # Try with lscpu, if available.
    try:
        cache_size, cache_instances = get_cache_info(cache_level)
    except (FileNotFoundError, ValueError, KeyError):
        # lscpu is not available, or its output could not be interpreted.
        return default_size

    # cache_instances typically refers to the number of sockets, CCXs or cores,
    # depending on the CPU and cache level.
    # In general, dividing the cache size by the number of instances would bring
    # best performance for private caches (L1 and L2). For shared caches (L3),
    # this should be the case as well, but more experimentation is needed.
    if cache_instances < 1:
        # Guard against a nonsensical count rather than dividing by zero.
        return default_size
    return cache_size // cache_instances
1212
1220
1213
1221
0 commit comments