[OpenMP] Add absolute KMP_HW_SUBSET functionality (#85326)

Jonathan Peyton · web-flow · commit 2ff3850ea19f · 2024-04-03T11:43:23.000-05:00
Users can put a : in front of KMP_HW_SUBSET to indicate that the
specified subset is an "absolute" subset. Currently, when a user puts
KMP_HW_SUBSET=1t. This gets translated to KMP_HW_SUBSET="*s,*c,1t",
where * means "use all of". If a user wants only one thread as the
entire topology they can now do KMP_HW_SUBSET=:1t.

Along with the absolute syntax is a fix for newer machines and making
them easier to use with only the 3-level topology syntax. When a user
puts KMP_HW_SUBSET=1s,4c,2t on a machine which actually has 4 layers,
(say 1s,2m,3c,2t as the entire machine) the user gets an unexpected "too
many resources asked" message because KMP_HW_SUBSET currently translates
the "4c" value to mean 4 cores per module. To help users out, the
runtime can assume that these newer layers, module in this case, should
be ignored if they are not specified, but the topology should always
take into account the sockets, cores, and threads layers.
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
@@ -496,7 +496,9 @@ An extended syntax is available when ``KMP_TOPOLOGY_METHOD=hwloc``. Depending on
 resources are detected, you may be able to specify additional resources, such as
 NUMA domains and groups of hardware resources that share certain cache levels.
 
-**Basic syntax:** ``[num_units|*]ID[@offset][:attribute] [,[num_units|*]ID[@offset][:attribute]...]``
+**Basic syntax:** ``[:][num_units|*]ID[@offset][:attribute] [,[num_units|*]ID[@offset][:attribute]...]``
+
+An optional colon (:) can be specified at the beginning of the syntax to specify an explicit hardware subset. The default is an implicit hardware subset.
 
 Supported unit IDs are not case-insensitive.
 
@@ -547,6 +549,18 @@ When any numa or tile units are specified in ``KMP_HW_SUBSET`` and the hwloc
 topology method is available, the ``KMP_TOPOLOGY_METHOD`` will be automatically
 set to hwloc, so there is no need to set it explicitly.
 
+For an **explicit hardware subset**, if one or more topology layers detected by the
+runtime are omitted from the subset, then those topology layers are ignored.
+Only explicitly specified topology layers are used in the subset.
+
+For an **implicit hardware subset**, it is implied that the socket, core, and thread
+topology types should be included in the subset. Other topology layers are not
+implicitly included and are ignored if they are not specified in the subset.
+Because the socket, core and thread topology types are always included in
+implicit hardware subsets, when they are omitted, it is assumed that all
+available resources of that type should be used. Implicit hardware subsets are
+the default.
+
 If you don't specify one or more types of resource, such as socket or thread,
 all available resources of that type are used.
 
@@ -565,7 +579,7 @@ This variable does not work if ``KMP_AFFINITY=disabled``.
 **Default:** If omitted, the default value is to use all the
 available hardware resources.
 
-**Examples:**
+**Implicit Hardware Subset Examples:**
 
 * ``2s,4c,2t``: Use the first 2 sockets (s0 and s1), the first 4 cores on each
   socket (c0 - c3), and 2 threads per core.
@@ -590,6 +604,12 @@ available hardware resources.
 * ``*c:eff1@3``: Use all available sockets, skip the first three cores of
   efficiency 1, and then use the rest of the available cores of efficiency 1.
 
+Explicit Hardware Subset Examples:
+
+* ``:2s,6t`` Use exactly the first two sockets and 6 threads per socket.
+* ``:1t@7`` Skip the first 7 threads (t0-t6) and use exactly one thread (t7).
+* ``:5c,1t`` Use exactly the first 5 cores (c0-c4) and the first thread on each core.
+
 To see the result of the setting, you can specify ``verbose`` modifier in
 ``KMP_AFFINITY`` environment variable. The OpenMP run-time library will output
 to ``stderr`` the information about the discovered hardware topology before and
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
@@ -987,41 +987,6 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
   _discover_uniformity();
 }
 
-// Represents running sub IDs for a single core attribute where
-// attribute values have SIZE possibilities.
-template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
-  int last_level; // last level in topology to consider for sub_ids
-  int sub_id[SIZE]; // The sub ID for a given attribute value
-  int prev_sub_id[KMP_HW_LAST];
-  IndexFunc indexer;
-
-public:
-  kmp_sub_ids_t(int last_level) : last_level(last_level) {
-    KMP_ASSERT(last_level < KMP_HW_LAST);
-    for (size_t i = 0; i < SIZE; ++i)
-      sub_id[i] = -1;
-    for (size_t i = 0; i < KMP_HW_LAST; ++i)
-      prev_sub_id[i] = -1;
-  }
-  void update(const kmp_hw_thread_t &hw_thread) {
-    int idx = indexer(hw_thread);
-    KMP_ASSERT(idx < (int)SIZE);
-    for (int level = 0; level <= last_level; ++level) {
-      if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
-        if (level < last_level)
-          sub_id[idx] = -1;
-        sub_id[idx]++;
-        break;
-      }
-    }
-    for (int level = 0; level <= last_level; ++level)
-      prev_sub_id[level] = hw_thread.sub_ids[level];
-  }
-  int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
-    return sub_id[indexer(hw_thread)];
-  }
-};
-
 #if KMP_AFFINITY_SUPPORTED
 static kmp_str_buf_t *
 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
@@ -1084,9 +1049,12 @@ bool kmp_topology_t::filter_hw_subset() {
   // First, sort the KMP_HW_SUBSET items by the machine topology
   __kmp_hw_subset->sort();
 
+  __kmp_hw_subset->canonicalize(__kmp_topology);
+
   // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
   bool using_core_types = false;
   bool using_core_effs = false;
+  bool is_absolute = __kmp_hw_subset->is_absolute();
   int hw_subset_depth = __kmp_hw_subset->get_depth();
   kmp_hw_t specified[KMP_HW_LAST];
   int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
@@ -1124,12 +1092,14 @@ bool kmp_topology_t::filter_hw_subset() {
 
     // Check to see if each layer's num & offset parameters are valid
     max_count = get_ratio(level);
-    if (max_count < 0 ||
-        (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
-      bool plural = (num > 1);
-      KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
-                      __kmp_hw_get_catalog_string(type, plural));
-      return false;
+    if (!is_absolute) {
+      if (max_count < 0 ||
+          (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+        bool plural = (num > 1);
+        KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
+                        __kmp_hw_get_catalog_string(type, plural));
+        return false;
+      }
     }
 
     // Check to see if core attributes are consistent
@@ -1192,7 +1162,7 @@ bool kmp_topology_t::filter_hw_subset() {
       }
 
       // Check that the number of requested cores with attributes is valid
-      if (using_core_types || using_core_effs) {
+      if ((using_core_types || using_core_effs) && !is_absolute) {
         for (int j = 0; j < item.num_attrs; ++j) {
           int num = item.num[j];
           int offset = item.offset[j];
@@ -1248,46 +1218,92 @@ bool kmp_topology_t::filter_hw_subset() {
     }
   }
 
-  struct core_type_indexer {
-    int operator()(const kmp_hw_thread_t &t) const {
-      switch (t.attrs.get_core_type()) {
-      case KMP_HW_CORE_TYPE_UNKNOWN:
-      case KMP_HW_MAX_NUM_CORE_TYPES:
-        return 0;
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-      case KMP_HW_CORE_TYPE_ATOM:
-        return 1;
-      case KMP_HW_CORE_TYPE_CORE:
-        return 2;
-#endif
-      }
-      KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
-      KMP_BUILTIN_UNREACHABLE;
+  // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
+  // or core attributes (core type or efficiency)
+  int prev_sub_ids[KMP_HW_LAST];
+  int abs_sub_ids[KMP_HW_LAST];
+  int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
+  int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
+  for (size_t i = 0; i < KMP_HW_LAST; ++i) {
+    abs_sub_ids[i] = -1;
+    prev_sub_ids[i] = -1;
+  }
+  for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
+    core_eff_sub_ids[i] = -1;
+  for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+    core_type_sub_ids[i] = -1;
+
+  // Determine which hardware threads should be filtered.
+
+  // Helpful to determine if a topology layer is targeted by an absolute subset
+  auto is_targeted = [&](int level) {
+    if (is_absolute) {
+      for (int i = 0; i < hw_subset_depth; ++i)
+        if (topology_levels[i] == level)
+          return true;
+      return false;
     }
+    // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
+    return true;
   };
-  struct core_eff_indexer {
-    int operator()(const kmp_hw_thread_t &t) const {
-      return t.attrs.get_core_eff();
+
+  // Helpful to index into core type sub Ids array
+  auto get_core_type_index = [](const kmp_hw_thread_t &t) {
+    switch (t.attrs.get_core_type()) {
+    case KMP_HW_CORE_TYPE_UNKNOWN:
+    case KMP_HW_MAX_NUM_CORE_TYPES:
+      return 0;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    case KMP_HW_CORE_TYPE_ATOM:
+      return 1;
+    case KMP_HW_CORE_TYPE_CORE:
+      return 2;
+#endif
     }
+    KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
+    KMP_BUILTIN_UNREACHABLE;
   };
 
-  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
-      core_level);
-  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
-      core_level);
+  // Helpful to index into core efficiencies sub Ids array
+  auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
+    return t.attrs.get_core_eff();
+  };
 
-  // Determine which hardware threads should be filtered.
   int num_filtered = 0;
   kmp_affin_mask_t *filtered_mask;
   KMP_CPU_ALLOC(filtered_mask);
   KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
   for (int i = 0; i < num_hw_threads; ++i) {
     kmp_hw_thread_t &hw_thread = hw_threads[i];
-    // Update type_sub_id
-    if (using_core_types)
-      core_type_sub_ids.update(hw_thread);
-    if (using_core_effs)
-      core_eff_sub_ids.update(hw_thread);
+
+    // Figure out the absolute sub ids and core eff/type sub ids
+    if (is_absolute || using_core_effs || using_core_types) {
+      for (int level = 0; level < get_depth(); ++level) {
+        if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
+          bool found_targeted = false;
+          for (int j = level; j < get_depth(); ++j) {
+            bool targeted = is_targeted(j);
+            if (!found_targeted && targeted) {
+              found_targeted = true;
+              abs_sub_ids[j]++;
+              if (j == core_level && using_core_effs)
+                core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
+              if (j == core_level && using_core_types)
+                core_type_sub_ids[get_core_type_index(hw_thread)]++;
+            } else if (targeted) {
+              abs_sub_ids[j] = 0;
+              if (j == core_level && using_core_effs)
+                core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
+              if (j == core_level && using_core_types)
+                core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
+            }
+          }
+          break;
+        }
+      }
+      for (int level = 0; level < get_depth(); ++level)
+        prev_sub_ids[level] = hw_thread.sub_ids[level];
+    }
 
     // Check to see if this hardware thread should be filtered
     bool should_be_filtered = false;
@@ -1322,20 +1338,24 @@ bool kmp_topology_t::filter_hw_subset() {
         int num = hw_subset_item.num[attr_idx];
         int offset = hw_subset_item.offset[attr_idx];
         if (using_core_types)
-          sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+          sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
         else
-          sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+          sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
         if (sub_id < offset ||
             (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
           should_be_filtered = true;
           break;
         }
       } else {
+        int sub_id;
         int num = hw_subset_item.num[0];
         int offset = hw_subset_item.offset[0];
-        if (hw_thread.sub_ids[level] < offset ||
-            (num != kmp_hw_subset_t::USE_ALL &&
-             hw_thread.sub_ids[level] >= offset + num)) {
+        if (is_absolute)
+          sub_id = abs_sub_ids[level];
+        else
+          sub_id = hw_thread.sub_ids[level];
+        if (sub_id < offset ||
+            (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
           should_be_filtered = true;
           break;
         }
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
@@ -1172,6 +1172,50 @@ class kmp_hw_subset_t {
     qsort(items, depth, sizeof(item_t), hw_subset_compare);
   }
   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
+
+  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
+  // This means putting each of {sockets, cores, threads} in the topology if
+  // they are not specified:
+  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
+  // e.g., 3module => *s,3module,*c,*t
+  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
+  // are expecting the traditional sockets/cores/threads topology. For newer
+  // hardware, there can be intervening layers like dies/tiles/modules
+  // (usually corresponding to a cache level). So when a user asks for
+  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
+  // should get 12 hardware threads across 6 cores and effectively ignore the
+  // module layer.
+  void canonicalize(const kmp_topology_t *top) {
+    // Layers to target for KMP_HW_SUBSET canonicalization
+    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
+
+    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
+    if (is_absolute())
+      return;
+
+    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
+    // topology doesn't have these layers
+    for (kmp_hw_t type : targeted)
+      if (top->get_level(type) == KMP_HW_UNKNOWN)
+        return;
+
+    // Put targeted layers in topology if they do not exist
+    for (kmp_hw_t type : targeted) {
+      bool found = false;
+      for (int i = 0; i < get_depth(); ++i) {
+        if (top->get_equivalent_type(items[i].type) == type) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
+      }
+    }
+    sort();
+    // Set as an absolute topology that only targets the targeted layers
+    set_absolute();
+  }
   void dump() const {
     printf("**********************\n");
     printf("*** kmp_hw_subset: ***\n");
diff --git a/openmp/runtime/test/affinity/kmp-abs-hw-subset.c b/openmp/runtime/test/affinity/kmp-abs-hw-subset.c