@@ -1164,50 +1164,58 @@ def apple_silicon_cache_size(cache_level: int) -> int:
11641164 return size .value
11651165
11661166
def get_cache_info(cache_level: int, *, lscpu_info=None) -> tuple:
    """Return ``(size_in_bytes, instances)`` for a cache level as reported by lscpu.

    Parameters
    ----------
    cache_level
        Cache level to look up.  Level 0 is looked up as the L1 data cache
        ("L1d cache:"); any other level ``n`` is looked up as "L<n> cache:".
    lscpu_info
        Optional, already-parsed output of ``lscpu --json``.  When omitted,
        ``lscpu --json`` is executed and its output parsed.  Passing it lets a
        caller query several levels with a single lscpu invocation.

    Returns
    -------
    tuple
        ``(size, instances)``: the size in bytes and the instance count, both
        taken from the matching lscpu entry.

    Raises
    ------
    FileNotFoundError
        If lscpu has to be run but the executable is not available.
    ValueError
        If the lscpu output is not valid JSON, the cache entry is missing, or
        its data cannot be parsed.
    """
    if lscpu_info is None:
        result = subprocess.run(["lscpu", "--json"], capture_output=True, text=True)
        lscpu_info = json.loads(result.stdout)

    # lscpu has no "L0" entry; level 0 stands for the L1 data cache.
    label = "1d" if cache_level == 0 else cache_level
    wanted_field = f"L{label} cache:"
    unit_factors = {"KiB": 2**10, "MiB": 2**20, "GiB": 2**30}

    for entry in _iter_lscpu_entries(lscpu_info["lscpu"]):
        if entry.get("field") != wanted_field:
            continue

        data = entry.get("data")
        if not isinstance(data, str):
            raise ValueError(f"{wanted_field} entry has no data in lscpu output")

        # Expected shape: "<number> <unit> (<count> instance[s])"
        size_str, sep, instances_str = data.partition(" (")
        size_tokens = size_str.split()
        count_tokens = instances_str.split()
        if not sep or len(size_tokens) != 2 or not count_tokens:
            raise ValueError(f"Cannot parse lscpu cache data: {data!r}")

        number, units = size_tokens
        if units not in unit_factors:
            raise ValueError("Unrecognized unit when guessing cache units")

        # The number may be fractional (e.g. "1.5 MiB"), so go through float.
        size = int(float(number) * unit_factors[units])
        instances = int(count_tokens[0])
        return size, instances

    raise ValueError(f"L{label} cache not found in lscpu output")


def _iter_lscpu_entries(entries):
    """Yield lscpu entries in document order, descending into any "children" lists."""
    # NOTE(review): some lscpu versions appear to nest entries under a
    # "children" key -- confirm against the util-linux versions you target.
    for entry in entries:
        yield entry
        yield from _iter_lscpu_entries(entry.get("children") or [])
11781189
11791190
def linux_cache_size(cache_level: int, default_size: int) -> int:
    """Get the data cache_level size in bytes for Linux.

    The size is first read from sysfs (the ``index{cache_level}/size`` file of
    cpu0).  If that file cannot be read or parsed, lscpu is tried through
    ``get_cache_info``.  If both sources fail, ``default_size`` is returned.

    Parameters
    ----------
    cache_level
        Cache index, used both as the sysfs ``index<N>`` directory number and
        as the level passed to ``get_cache_info``.
    default_size
        Size in bytes to return when the cache size cannot be determined.

    Returns
    -------
    int
        The cache size in bytes, or ``default_size``.
    """
    suffix_factors = {"K": 2**10, "M": 2**20, "G": 2**30}

    try:
        # Try to read the cache size from sysfs
        with open(f"/sys/devices/system/cpu/cpu0/cache/index{cache_level}/size") as f:
            size = f.read().strip()
        suffix = size[-1:]
        if suffix in suffix_factors:
            return int(size[:-1]) * suffix_factors[suffix]
    except (OSError, ValueError):
        # Missing/unreadable file or unparsable number: fall through to lscpu.
        pass

    # Try with lscpu, if available.
    try:
        cache_size, cache_instances = get_cache_info(cache_level)
    except (OSError, ValueError, KeyError):
        # If lscpu is not available or its output cannot be parsed,
        # return the default size.
        return default_size

    if cache_instances <= 0:
        # An instance count we cannot divide by; treat it as unknown.
        return default_size

    # cache_instances typically refers to the number of sockets, CCXs or cores,
    # depending on the CPU and cache level.
    # In general, dividing the cache size by the number of instances would bring
    # best performance for private caches (L1 and L2). For shared caches (L3),
    # this should be the case as well, but more experimentation is needed.
    return cache_size // cache_instances
12121220
12131221
0 commit comments