jl_cpu_threads: exclude big.LITTLE efficiency cores (#42099)

staticfloat · web-flow · commit d0d895e92e22 · 2021-09-06T21:44:39.000-07:00
* jl_cpu_threads: exclude big.LITTLE efficiency cores On big.LITTLE systems, we generally only want to spawn as many threads/tasks as there are performance cores. By default, we want to leave the efficiency cores alone, as they may end up choking on the heavy workloads we are likely to schedule. Even something as simple as starting `julia` and initializing OpenBLAS on each thread can cause a system-wide latency spike as the efficiency cores struggle to chew through the momentary workload. To fix this, we attempt to identify when we are running on a big.LITTLE system (the only one currently widely supported is the Apple M1), and we subtract out the known number of efficiency cores. Once macOS 12 is released, we will be able to use the official API for enumerating the perflevels of the available cores, demonstrated in this PR to pytorch's cpuinfo repository [0]. [0] https://github.com/pytorch/cpuinfo/blob/8ab2db2d405436f1014ed603021545b3b6b6f1ae/src/arm/mach/init.c#L161-L163 * whitespace
diff --git a/src/sys.c b/src/sys.c
@@ -587,6 +587,15 @@ typedef DWORD (WINAPI *GAPC)(WORD);
 #endif
 #endif
 
+// Apple's M1 processor is a big.LITTLE style processor, with 4x "performance"
+// cores, and 4x "efficiency" cores.  Because Julia expects to be able to run
+// things like heavy linear algebra workloads on all cores, it's best for us
+// to only spawn as many threads as there are performance cores.  Once macOS
+// 12 is released, we'll be able to query the multiple "perf levels" of the
+// cores of a CPU (see src/arm/mach/init.c in pytorch/cpuinfo for an example),
+// but until it's released, we will just recognize the M1 by its CPU family
+// identifier, then subtract how many efficiency cores we know it has.
+
 JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT
 {
 #if defined(HW_AVAILCPU) && defined(HW_NCPU)
@@ -599,6 +608,19 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT
         sysctl(nm, 2, &count, &len, NULL, 0);
         if (count < 1) { count = 1; }
     }
+
+#if defined(__APPLE__) && defined(_CPU_AARCH64_)
+    // Manually subtract efficiency cores for Apple's big.LITTLE cores
+    int32_t family = 0;
+    len = 4;
+    sysctlbyname("hw.cpufamily", &family, &len, NULL, 0);
+    if (family >= 1 && count > 1) {
+        if (family == CPUFAMILY_ARM_FIRESTORM_ICESTORM) {
+            // We know the Apple M1 has 4 efficiency cores, so subtract them out.
+            count -= 4;
+        }
+    }
+#endif
     return count;
 #elif defined(_SC_NPROCESSORS_ONLN)
     long count = sysconf(_SC_NPROCESSORS_ONLN);