
Commit 5623ad7

Merge commit 'cc0cf2d04c39c7571fe0194a8172af37fcd69a7e'
2 parents: cf98f3a + cc0cf2d

File tree: 6 files changed, +26 -27 lines


.github/workflows/llvm-build.yml

Lines changed: 0 additions & 2 deletions
@@ -107,7 +107,6 @@ jobs:
   -DLLVM_INSTALL_UTILS=ON
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
   -DLLVM_ENABLE_TERMINFO=OFF
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
   llvm-project/llvm

   ninja -C llvm-project/build check-mlir install
@@ -131,7 +130,6 @@ jobs:
   -DLLVM_INSTALL_UTILS=ON
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
   -DLLVM_ENABLE_TERMINFO=OFF
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
   llvm-project/llvm

   ninja -C llvm-project/build check-mlir install

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_ENABLE_PROJECTS=mlir \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF \
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
   /source/llvm-project/llvm

python/triton/compiler/compiler.py

Lines changed: 5 additions & 1 deletion
@@ -58,7 +58,7 @@ def hash(self):
     "ptx": ptx_prototype_pattern,
 }

-mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+),?'
+mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+(?: {[^}]+})?),?'
 ptx_arg_type_pattern = r"\.param\s+\.(\w+)"
 arg_type_pattern = {
     "ttir": mlir_arg_type_pattern,
@@ -71,6 +71,10 @@ def convert_type_repr(x):
     # Currently we only capture the pointer type and assume the pointer is on global memory.
     # TODO: Capture and support shared memory space
     match = re.search(r'!tt\.ptr<([^,]+)', x)
+    tma = re.search(r'tt.nv_tma_desc = 1', x)
+    if tma is not None:
+        return 'nvTmaDesc'
+    x = re.sub(r' {[^}]+}', '', x)
     if match is not None:
         return '*' + convert_type_repr(match.group(1))
     return x
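
For context, here is a minimal sketch of what the widened `mlir_arg_type_pattern` and the new TMA check do to a ttir signature. The signature string, argument names, and attribute values below are made up for illustration; only the two regexes and the `nvTmaDesc` marker come from the change, and the real `convert_type_repr` additionally rewrites pointer types to `*<elt>`:

```python
import re

# Pattern from this commit: the trailing `(?: {[^}]+})?` lets the capture also
# include an attribute dictionary such as `{tt.nv_tma_desc = 1}` on an argument.
mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+(?: {[^}]+})?),?'

# Hypothetical ttir signature fragment, for illustration only.
signature = ('%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, '
             '%arg1: i64 {tt.nv_tma_desc = 1}, '
             '%arg2: i32')

for arg_type in re.findall(mlir_arg_type_pattern, signature):
    # Mirrors the new branch in convert_type_repr: TMA descriptor arguments are
    # reported as 'nvTmaDesc'; otherwise the attribute dictionary is stripped again.
    if re.search(r'tt.nv_tma_desc = 1', arg_type):
        print('nvTmaDesc')
    else:
        print(re.sub(r' {[^}]+}', '', arg_type))

# Prints:
#   !tt.ptr<f32>
#   nvTmaDesc
#   i32
```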

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -359,7 +359,7 @@ def make_cubin(src, metadata, opt, capability):

             raise RuntimeError(f'{error}\n'
                                f'`ptxas` stderr:\n{log}\n'
-                               f'Repro command: {ptxas_cmd}\n')
+                               f'Repro command: {" ".join(ptxas_cmd)}\n')

         with open(fbin, 'rb') as f:
             cubin = f.read()
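
The repro-command fix is purely cosmetic: joining the argument list yields a copy-pasteable shell command instead of a Python list repr. A small sketch, where the `ptxas_cmd` value is invented for illustration:

```python
ptxas_cmd = ['ptxas', '-arch=sm_90', 'kernel.ptx', '-o', 'kernel.cubin']  # made-up example

# Before: the f-string interpolates the list's repr.
print(f'Repro command: {ptxas_cmd}')
# Repro command: ['ptxas', '-arch=sm_90', 'kernel.ptx', '-o', 'kernel.cubin']

# After: a plain shell command the user can paste directly.
print(f'Repro command: {" ".join(ptxas_cmd)}')
# Repro command: ptxas -arch=sm_90 kernel.ptx -o kernel.cubin
```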

third_party/proton/README.md

Lines changed: 0 additions & 1 deletion
@@ -161,7 +161,6 @@ The following example demonstrates how to use instruction sampling:
 ```python
 import triton.profiler as proton

-
 proton.start(name="profile_name", context="shadow", backend="cupti_pcsampling")
 ```

third_party/proton/proton/viewer.py

Lines changed: 20 additions & 21 deletions
@@ -104,8 +104,6 @@ def get_min_time_bytes(df, device_info):

 def derive_metrics(gf, metrics, raw_metrics, device_info):
     derived_metrics = []
-    original_metrics = []
-    exclusive_metrics = ["util"] + list(derivable_metrics.keys()) + list(avg_time_factor_dict.factor.keys())
     internal_frame_indices = gf.dataframe["device_id"].isna()

     def get_time_seconds(df):
@@ -121,10 +119,10 @@ def get_time_seconds(df):
             gf.dataframe["util (inc)"] = min_time_flops["min_time"].combine(min_time_bytes["min_time"], max) / time_sec
             gf.dataframe.loc[internal_frame_indices, "util (inc)"] = np.nan
             derived_metrics.append("util (inc)")
-        elif metric in derivable_metrics:
-            deriveable_metric = derivable_metrics[metric]
-            metric_name = deriveable_metric.name
-            metric_factor_dict = deriveable_metric.factor
+        elif metric in derivable_metrics:  # flop<width>/s, <t/g>byte/s
+            derivable_metric = derivable_metrics[metric]
+            metric_name = derivable_metric.name
+            metric_factor_dict = derivable_metric.factor
             matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
             gf.dataframe[f"{metric} (inc)"] = (gf.dataframe[matched_metric_name] / (get_time_seconds(gf.dataframe)) /
                                                metric_factor_dict[metric])
@@ -134,24 +132,28 @@ def get_time_seconds(df):
             gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) /
                                                time_factor_dict.factor[metric_time_unit])
             derived_metrics.append(f"{metric} (inc)")
-            metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0]
         elif metric in avg_time_factor_dict.factor:
             metric_time_unit = avg_time_factor_dict.name + "/" + metric.split("/")[1]
             gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['count'] /
                                                avg_time_factor_dict.factor[metric_time_unit])
             gf.dataframe.loc[internal_frame_indices, f"{metric} (inc)"] = np.nan
             derived_metrics.append(f"{metric} (inc)")
         else:
-            original_metrics.append(metric)
-            if metric not in exclusive_metrics:
-                single_frame = gf.dataframe[metric_name]
-                total = gf.dataframe[metric_name].iloc[0]
-                metric = metric.split("/")[0]
-                gf.dataframe[f"{metric}/% (inc)"] = (single_frame / total) * 100.0
-                derived_metrics.append(f"{metric}/% (inc)")
-    if original_metrics:
-        original_metrics = match_available_metrics(original_metrics, raw_metrics)
-    return derived_metrics + original_metrics
+            metric_name_and_unit = metric.split("/")
+            metric_name = metric_name_and_unit[0]
+            if len(metric_name_and_unit) > 1:
+                metric_unit = metric_name_and_unit[1]
+                if metric_unit != "%":
+                    raise ValueError(f"Unsupported unit {metric_unit}")
+                matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
+                single_frame = gf.dataframe[matched_metric_name]
+                total = gf.dataframe[matched_metric_name].iloc[0]
+                gf.dataframe[f"{metric_name}/% (inc)"] = (single_frame / total) * 100.0
+                derived_metrics.append(f"{metric_name}/% (inc)")
+            else:
+                matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
+                derived_metrics.append(matched_metric_name)
+    return derived_metrics


 def format_frames(gf, format):
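
The rewritten fallback branch now interprets a requested metric as `name` or `name/%` and rejects any other unit; recognized units such as `time/ms` are handled by the earlier branches and never reach this code. A standalone sketch of just that parsing step, using a hypothetical helper (`match_available_metrics` and the dataframe arithmetic from the real code are omitted):

```python
def split_metric(metric):
    # Hypothetical helper mirroring the split/validation added in derive_metrics.
    name_and_unit = metric.split("/")
    name = name_and_unit[0]
    if len(name_and_unit) > 1:
        unit = name_and_unit[1]
        if unit != "%":
            raise ValueError(f"Unsupported unit {unit}")
        return name, "%"   # derive a per-frame percentage column, e.g. "time/% (inc)"
    return name, None      # pass the raw metric through unchanged

print(split_metric("time/%"))   # ('time', '%')
print(split_metric("flops16"))  # ('flops16', None)
# split_metric("count/abc") would raise: ValueError: Unsupported unit abc
```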
@@ -234,10 +236,7 @@ def main():
     - flop[<8/16/32/64>]/s, gflop[<8/16/32/64>]/s, tflop[<8/16/32/64>]/s: flops / time
     - byte/s, gbyte/s, tbyte/s: bytes / time
     - util: max(sum(flops<width>) / peak_flops<width>_time, sum(bytes) / peak_bandwidth_time)
-
-    For inclusive metrics (e.g. time) an additional column is printed showing the percentage
-    each frame is of the full model.
-
+    - <metric>/%%: frame(metric) / sum(metric). Only availble for inclusive metrics (e.g. time)
     """,
     )
     argparser.add_argument(
