
Commit 5623ad7

Merge commit 'cc0cf2d04c39c7571fe0194a8172af37fcd69a7e'
2 parents: cf98f3a + cc0cf2d

File tree: 6 files changed, +26 -27 lines


.github/workflows/llvm-build.yml

Lines changed: 0 additions & 2 deletions
@@ -107,7 +107,6 @@ jobs:
   -DLLVM_INSTALL_UTILS=ON
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
   -DLLVM_ENABLE_TERMINFO=OFF
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
   llvm-project/llvm

   ninja -C llvm-project/build check-mlir install
@@ -131,7 +130,6 @@ jobs:
   -DLLVM_INSTALL_UTILS=ON
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
   -DLLVM_ENABLE_TERMINFO=OFF
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
   llvm-project/llvm

   ninja -C llvm-project/build check-mlir install

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_ENABLE_PROJECTS=mlir \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
-  -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF \
   -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
   /source/llvm-project/llvm

python/triton/compiler/compiler.py

Lines changed: 5 additions & 1 deletion
@@ -58,7 +58,7 @@ def hash(self):
     "ptx": ptx_prototype_pattern,
 }

-mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+),?'
+mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+(?: {[^}]+})?),?'
 ptx_arg_type_pattern = r"\.param\s+\.(\w+)"
 arg_type_pattern = {
     "ttir": mlir_arg_type_pattern,
@@ -71,6 +71,10 @@ def convert_type_repr(x):
     # Currently we only capture the pointer type and assume the pointer is on global memory.
     # TODO: Capture and support shared memory space
     match = re.search(r'!tt\.ptr<([^,]+)', x)
+    tma = re.search(r'tt.nv_tma_desc = 1', x)
+    if tma is not None:
+        return 'nvTmaDesc'
+    x = re.sub(r' {[^}]+}', '', x)
     if match is not None:
         return '*' + convert_type_repr(match.group(1))
     return x
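
For context, here is a minimal sketch of what the widened `mlir_arg_type_pattern` and the new TMA check do to a ttir signature. The signature string, argument names, and attribute values below are made up for illustration; only the two regexes and the `nvTmaDesc` marker come from the change, and the real `convert_type_repr` additionally rewrites pointer types to `*<elt>`:

```python
import re

# Pattern from this commit: the trailing `(?: {[^}]+})?` lets the capture also
# include an attribute dictionary such as `{tt.nv_tma_desc = 1}` on an argument.
mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+(?: {[^}]+})?),?'

# Hypothetical ttir signature fragment, for illustration only.
signature = ('%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, '
             '%arg1: i64 {tt.nv_tma_desc = 1}, '
             '%arg2: i32')

for arg_type in re.findall(mlir_arg_type_pattern, signature):
    # Mirrors the new branch in convert_type_repr: TMA descriptor arguments are
    # reported as 'nvTmaDesc'; otherwise the attribute dictionary is stripped again.
    if re.search(r'tt.nv_tma_desc = 1', arg_type):
        print('nvTmaDesc')
    else:
        print(re.sub(r' {[^}]+}', '', arg_type))

# Prints:
#   !tt.ptr<f32>
#   nvTmaDesc
#   i32
```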

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -359,7 +359,7 @@ def make_cubin(src, metadata, opt, capability):

             raise RuntimeError(f'{error}\n'
                                f'`ptxas` stderr:\n{log}\n'
-                               f'Repro command: {ptxas_cmd}\n')
+                               f'Repro command: {" ".join(ptxas_cmd)}\n')

         with open(fbin, 'rb') as f:
             cubin = f.read()
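
The repro-command fix is purely cosmetic: joining the argument list yields a copy-pasteable shell command instead of a Python list repr. A small sketch, where the `ptxas_cmd` value is invented for illustration:

```python
ptxas_cmd = ['ptxas', '-arch=sm_90', 'kernel.ptx', '-o', 'kernel.cubin']  # made-up example

# Before: the f-string interpolates the list's repr.
print(f'Repro command: {ptxas_cmd}')
# Repro command: ['ptxas', '-arch=sm_90', 'kernel.ptx', '-o', 'kernel.cubin']

# After: a plain shell command the user can paste directly.
print(f'Repro command: {" ".join(ptxas_cmd)}')
# Repro command: ptxas -arch=sm_90 kernel.ptx -o kernel.cubin
```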

third_party/proton/README.md

Lines changed: 0 additions & 1 deletion
@@ -161,7 +161,6 @@ The following example demonstrates how to use instruction sampling:
 ```python
 import triton.profiler as proton

-
 proton.start(name="profile_name", context="shadow", backend="cupti_pcsampling")
 ```

third_party/proton/proton/viewer.py

Lines changed: 20 additions & 21 deletions
@@ -104,8 +104,6 @@ def get_min_time_bytes(df, device_info):

 def derive_metrics(gf, metrics, raw_metrics, device_info):
     derived_metrics = []
-    original_metrics = []
-    exclusive_metrics = ["util"] + list(derivable_metrics.keys()) + list(avg_time_factor_dict.factor.keys())
     internal_frame_indices = gf.dataframe["device_id"].isna()

     def get_time_seconds(df):
@@ -121,10 +119,10 @@ def get_time_seconds(df):
             gf.dataframe["util (inc)"] = min_time_flops["min_time"].combine(min_time_bytes["min_time"], max) / time_sec
             gf.dataframe.loc[internal_frame_indices, "util (inc)"] = np.nan
             derived_metrics.append("util (inc)")
-        elif metric in derivable_metrics:
-            deriveable_metric = derivable_metrics[metric]
-            metric_name = deriveable_metric.name
-            metric_factor_dict = deriveable_metric.factor
+        elif metric in derivable_metrics:  # flop<width>/s, <t/g>byte/s
+            derivable_metric = derivable_metrics[metric]
+            metric_name = derivable_metric.name
+            metric_factor_dict = derivable_metric.factor
             matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
             gf.dataframe[f"{metric} (inc)"] = (gf.dataframe[matched_metric_name] / (get_time_seconds(gf.dataframe)) /
                                                metric_factor_dict[metric])
@@ -134,24 +132,28 @@ def get_time_seconds(df):
             gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) /
                                                time_factor_dict.factor[metric_time_unit])
             derived_metrics.append(f"{metric} (inc)")
-            metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0]
         elif metric in avg_time_factor_dict.factor:
             metric_time_unit = avg_time_factor_dict.name + "/" + metric.split("/")[1]
             gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['count'] /
                                                avg_time_factor_dict.factor[metric_time_unit])
             gf.dataframe.loc[internal_frame_indices, f"{metric} (inc)"] = np.nan
             derived_metrics.append(f"{metric} (inc)")
         else:
-            original_metrics.append(metric)
-            if metric not in exclusive_metrics:
-                single_frame = gf.dataframe[metric_name]
-                total = gf.dataframe[metric_name].iloc[0]
-                metric = metric.split("/")[0]
-                gf.dataframe[f"{metric}/% (inc)"] = (single_frame / total) * 100.0
-                derived_metrics.append(f"{metric}/% (inc)")
-    if original_metrics:
-        original_metrics = match_available_metrics(original_metrics, raw_metrics)
-    return derived_metrics + original_metrics
+            metric_name_and_unit = metric.split("/")
+            metric_name = metric_name_and_unit[0]
+            if len(metric_name_and_unit) > 1:
+                metric_unit = metric_name_and_unit[1]
+                if metric_unit != "%":
+                    raise ValueError(f"Unsupported unit {metric_unit}")
+                matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
+                single_frame = gf.dataframe[matched_metric_name]
+                total = gf.dataframe[matched_metric_name].iloc[0]
+                gf.dataframe[f"{metric_name}/% (inc)"] = (single_frame / total) * 100.0
+                derived_metrics.append(f"{metric_name}/% (inc)")
+            else:
+                matched_metric_name = match_available_metrics([metric_name], raw_metrics)[0]
+                derived_metrics.append(matched_metric_name)
+    return derived_metrics


 def format_frames(gf, format):
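
The rewritten fallback branch now interprets a requested metric as `name` or `name/%` and rejects any other unit; recognized units such as `time/ms` are handled by the earlier branches and never reach this code. A standalone sketch of just that parsing step, using a hypothetical helper (`match_available_metrics` and the dataframe arithmetic from the real code are omitted):

```python
def split_metric(metric):
    # Hypothetical helper mirroring the split/validation added in derive_metrics.
    name_and_unit = metric.split("/")
    name = name_and_unit[0]
    if len(name_and_unit) > 1:
        unit = name_and_unit[1]
        if unit != "%":
            raise ValueError(f"Unsupported unit {unit}")
        return name, "%"   # derive a per-frame percentage column, e.g. "time/% (inc)"
    return name, None      # pass the raw metric through unchanged

print(split_metric("time/%"))   # ('time', '%')
print(split_metric("flops16"))  # ('flops16', None)
# split_metric("count/abc") would raise: ValueError: Unsupported unit abc
```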
@@ -234,10 +236,7 @@ def main():
     - flop[<8/16/32/64>]/s, gflop[<8/16/32/64>]/s, tflop[<8/16/32/64>]/s: flops / time
     - byte/s, gbyte/s, tbyte/s: bytes / time
     - util: max(sum(flops<width>) / peak_flops<width>_time, sum(bytes) / peak_bandwidth_time)
-
-    For inclusive metrics (e.g. time) an additional column is printed showing the percentage
-    each frame is of the full model.
-
+    - <metric>/%%: frame(metric) / sum(metric). Only availble for inclusive metrics (e.g. time)
     """,
     )
     argparser.add_argument(
