diff --git a/.github/scripts/op_calculate_best_perf.py b/.github/scripts/op_calculate_best_perf.py
index aa5c1e29f5..21bb0f2551 100644
--- a/.github/scripts/op_calculate_best_perf.py
+++ b/.github/scripts/op_calculate_best_perf.py
@@ -14,13 +14,32 @@
 updated_cases = []
 removed_cases = []
 
+def safe_float_convert(value):
+    try:
+        return float(value) if value.strip() else None
+    except (ValueError, AttributeError):
+        return None
+
 def update_baseline(xpu_file, baseline_file, remove_missing=False):
     with open(xpu_file) as f:
         xpu_reader = csv.DictReader(f, delimiter=';')
         xpu_rows = list(xpu_reader)
-        xpu_fieldnames = xpu_reader.fieldnames  # Keep original field order
-        fieldnames = [f for f in xpu_fieldnames if f not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']]
-        xpu_data = {make_key(row, fieldnames): (float(row['time(us)']), row) for row in xpu_rows}
+        xpu_fieldnames = xpu_reader.fieldnames
+        time_fields = ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']
+        fieldnames = [f for f in xpu_fieldnames if f not in time_fields]
+        xpu_data = {}
+        for row in xpu_rows:
+            key = make_key(row, fieldnames)
+            time_values = {}
+            if 'time(us)' in row:
+                time_val = safe_float_convert(row['time(us)'])
+                if time_val is not None:
+                    time_values['time(us)'] = time_val
+            if 'E2E total time(us)' in row:
+                e2e_val = safe_float_convert(row['E2E total time(us)'])
+                if e2e_val is not None:
+                    time_values['E2E total time(us)'] = e2e_val
+            xpu_data[key] = (time_values, row)
 
     with open(baseline_file) as f:
         baseline_reader = csv.DictReader(f, delimiter=';')
@@ -28,8 +47,15 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
         baseline_fieldnames = baseline_reader.fieldnames
 
     # To add new parameters of new ops into the baseline file
-    all_fieldnames = xpu_fieldnames + [f for f in baseline_fieldnames if f not in xpu_fieldnames]
-    fieldnames = [f for f in all_fieldnames if f not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']]
+    all_fieldnames = list(set(xpu_fieldnames + baseline_fieldnames))
+    # Maintain original order as much as possible
+    ordered_fieldnames = []
+    for f in xpu_fieldnames:
+        if f in all_fieldnames and f not in ordered_fieldnames:
+            ordered_fieldnames.append(f)
+    for f in baseline_fieldnames:
+        if f in all_fieldnames and f not in ordered_fieldnames:
+            ordered_fieldnames.append(f)
 
     baseline_keys = {make_key(row, fieldnames) for row in baseline_rows}
     xpu_keys = set(xpu_data.keys())
@@ -38,75 +64,117 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
     for row in baseline_rows:
         key = make_key(row, fieldnames)
         if key in xpu_data:
-            xpu_time, xpu_row = xpu_data[key]
-            baseline_time = float(row['time(us)'])
-
-            if xpu_time < baseline_time:
-                updated_row = {}
-                for field in all_fieldnames:
-                    updated_row[field] = xpu_row.get(field, row.get(field, ''))
-                updated_row['time(us)'] = str(xpu_time)
-                if 'E2E total time(us)' in row:
-                    updated_row['E2E total time(us)'] = row['E2E total time(us)']
-                updated_cases.append((key, baseline_time, xpu_time, updated_row))
-                updated_rows.append(updated_row)
-            else:
-                ordered_row = {}
-                for field in all_fieldnames:
-                    ordered_row[field] = row.get(field, '')
-                updated_rows.append(ordered_row)
+            xpu_times, xpu_row = xpu_data[key]
+            updated_row = {}
+
+            # Copy all fields from baseline first
+            for field in ordered_fieldnames:
+                updated_row[field] = row.get(field, '')
+
+            # Update with xpu values where they exist; time fields are
+            # handled separately below so a slower run cannot clobber
+            # the best recorded time
+            for field in ordered_fieldnames:
+                if field not in time_fields and field in xpu_row and xpu_row[field]:
+                    updated_row[field] = xpu_row[field]
+
+            # Handle time fields
+            updated = False
+            if 'time(us)' in xpu_times and 'time(us)' in row:
+                baseline_time = safe_float_convert(row['time(us)'])
+                if baseline_time is not None:
+                    xpu_time = xpu_times['time(us)']
+                    if xpu_time < baseline_time:
+                        updated_row['time(us)'] = str(xpu_time)
+                        updated = True
+
+            if 'E2E total time(us)' in xpu_times and 'E2E total time(us)' in row:
+                baseline_e2e = safe_float_convert(row['E2E total time(us)'])
+                if baseline_e2e is not None:
+                    xpu_e2e = xpu_times['E2E total time(us)']
+                    if xpu_e2e < baseline_e2e:
+                        updated_row['E2E total time(us)'] = str(xpu_e2e)
+                        updated = True
+
+            if updated:
+                updated_cases.append((key, row, updated_row))
+            updated_rows.append(updated_row)
         elif not remove_missing:
             ordered_row = {}
-            for field in all_fieldnames:
+            for field in ordered_fieldnames:
                 ordered_row[field] = row.get(field, '')
             updated_rows.append(ordered_row)
 
     # Add new cases
     for key in xpu_keys - baseline_keys:
-        xpu_time, xpu_row = xpu_data[key]
+        xpu_times, xpu_row = xpu_data[key]
         new_row = {}
-        for field in all_fieldnames:
+        for field in ordered_fieldnames:
             new_row[field] = xpu_row.get(field, '')
-        new_row['time(us)'] = str(xpu_time)
+
+        if 'time(us)' in xpu_times:
+            new_row['time(us)'] = str(xpu_times['time(us)'])
+        if 'E2E total time(us)' in xpu_times:
+            new_row['E2E total time(us)'] = str(xpu_times['E2E total time(us)'])
+
         updated_rows.append(new_row)
-        added_cases.append((key, xpu_time, new_row))
+        added_cases.append((key, xpu_times, new_row))
 
     # Resolve removed cases
     if remove_missing:
         for key in baseline_keys - xpu_keys:
             removed_case = next(row for row in baseline_rows if make_key(row, fieldnames) == key)
-            removed_cases.append((key, float(removed_case['time(us)']), removed_case))
+            removed_cases.append((key, removed_case))
 
     if added_cases:
         print(f"\nAdded {len(added_cases)} new case(s):")
-        for key, time, row in added_cases:
+        for key, times, row in added_cases:
             print(f"\n[New Case] {format_case(key)}")
-            print(f"Time: {time} us")
+            if 'time(us)' in times:
+                print(f"Time: {times['time(us)']} us")
+            if 'E2E total time(us)' in times:
+                print(f"E2E Time: {times['E2E total time(us)']} us")
             print("Parameters:")
             for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
     if updated_cases:
         print(f"\nUpdated {len(updated_cases)} case(s):")
-        for key, old_time, new_time, row in updated_cases:
+        for key, old_row, new_row in updated_cases:
             print(f"\n[Updated] {format_case(key)}")
-            print(f"Time: {old_time} us → {new_time} us")
+            if 'time(us)' in old_row and 'time(us)' in new_row:
+                old_time = safe_float_convert(old_row['time(us)'])
+                new_time = safe_float_convert(new_row['time(us)'])
+                if old_time is not None and new_time is not None and old_time != new_time:
+                    print(f"Time: {old_time} us → {new_time} us")
+
+            if 'E2E total time(us)' in old_row and 'E2E total time(us)' in new_row:
+                old_e2e = safe_float_convert(old_row['E2E total time(us)'])
+                new_e2e = safe_float_convert(new_row['E2E total time(us)'])
+                if old_e2e is not None and new_e2e is not None and old_e2e != new_e2e:
+                    print(f"E2E Time: {old_e2e} us → {new_e2e} us")
+
             print("Parameters:")
-            for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+            for k, v in new_row.items():
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
     if remove_missing and removed_cases:
         print(f"\nRemoved {len(removed_cases)} case(s):")
-        for key, time, row in removed_cases:
+        for key, row in removed_cases:
             print(f"\n[Removed] {format_case(key)}")
-            print(f"Time: {time} us")
+            if 'time(us)' in row:
+                time_val = safe_float_convert(row['time(us)'])
+                if time_val is not None:
+                    print(f"Time: {time_val} us")
+            if 'E2E total time(us)' in row:
+                e2e_val = safe_float_convert(row['E2E total time(us)'])
+                if e2e_val is not None:
+                    print(f"E2E Time: {e2e_val} us")
             print("Parameters:")
             for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
@@ -117,7 +185,7 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
     Path(baseline_file).rename(backup_file)
 
     with open(baseline_file, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=all_fieldnames, delimiter=';')
+        writer = csv.DictWriter(f, fieldnames=ordered_fieldnames, delimiter=';')
         writer.writeheader()
         writer.writerows(updated_rows)
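A note on the update semantics above: `make_key` and `format_case` are defined elsewhere in the script and are not shown in this diff, so the `make_key` below is an assumption for illustration only. The idea is that a case is identified by its non-time columns, and the baseline keeps the faster of the two measurements while tolerating blank cells. A minimal sketch:

# Minimal sketch of the min-keeping update, not the real script.
# make_key here is a hypothetical stand-in: it maps a CSV row to a
# hashable tuple of (field, value) pairs over the non-time columns.
def make_key(row, fieldnames):
    return tuple((f, str(row.get(f, ''))) for f in fieldnames)

def safe_float_convert(value):
    try:
        return float(value) if value.strip() else None
    except (ValueError, AttributeError):
        return None

baseline = {'op_name': 'add', 'time(us)': '12.0'}
current = {'op_name': 'add', 'time(us)': ''}      # blank cell: ignored
rerun = {'op_name': 'add', 'time(us)': '10.5'}
fields = ['op_name']
assert make_key(baseline, fields) == make_key(rerun, fields)

# Keep the best (lowest) valid time among the candidate rows.
times = [safe_float_convert(r['time(us)']) for r in (baseline, current, rerun)]
best = min(t for t in times if t is not None)
print(best)  # 10.5 -> the baseline row would be updated to the new best time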
".join(params) -def display_comparison(results, threshold, xpu_file): +def display_comparison(results, threshold, xpu_file, compare_both): if 'forward' in xpu_file.lower(): direction = "Forward" elif 'backward' in xpu_file.lower(): @@ -61,97 +66,132 @@ def display_comparison(results, threshold, xpu_file): write_to_github_summary(f"## {direction} No outlier exceeding ({threshold:.0%})") return - results['diff_float'] = results['difference'].str.rstrip('%').astype(float) - regression = results[results['change'] == '↓'].sort_values('diff_float', ascending=False) - improvement = results[results['change'] == '↑'].sort_values('diff_float') + # Prepare display records - always include both metrics when available + display_records = [] + for _, row in results.iterrows(): + record = display_row(row) + display_record = { + 'Case Name': record['case_name'], + 'Op Name': record['op_name'], + 'Datatype': record['datatype'], + 'Parameters': format_parameters(record) + } - if not regression.empty: - print("\n🔴 Regression:") - display_records = [] - for _, row in regression.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] + # Always try to include profile time if it exists in the data + if 'profile_time_xpu' in record or 'profile_time_base' in record: + display_record.update({ + 'Profile Current(us)': record.get('profile_time_xpu', 'N/A'), + 'Profile Baseline(us)': record.get('profile_time_base', 'N/A'), + 'Profile Diff': record.get('profile_diff', 'N/A'), + 'Profile Change': record.get('profile_change', '') + }) + + # Always try to include E2E time if it exists in the data + if 'e2e_time_xpu' in record or 'e2e_time_base' in record: + display_record.update({ + 'E2E Current(us)': record.get('e2e_time_xpu', 'N/A'), + 'E2E Baseline(us)': record.get('e2e_time_base', 'N/A'), + 'E2E Diff': record.get('e2e_diff', 'N/A'), + 'E2E Change': record.get('e2e_change', '') }) + display_records.append(display_record) + + # Classify records based on changes + regression_records = [] + improvement_records = [] + mixed_records = [] + + for record in results.to_dict('records'): + profile_change = record.get('profile_change') + e2e_change = record.get('e2e_change') + + profile_regression = profile_change == '↓' + profile_improve = profile_change == '↑' + e2e_regression = e2e_change == '↓' + e2e_improve = e2e_change == '↑' + + if (profile_regression and e2e_improve) or (profile_improve and e2e_regression): + mixed_records.append(record) + elif profile_regression or e2e_regression: + regression_records.append(record) + elif profile_improve or e2e_improve: + improvement_records.append(record) + + # Print results + if regression_records: + print("\n🔴 Regression:") + regression_display = [r for r in display_records + if (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓') + and not (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑')] print(tabulate( - display_records, + regression_display, headers="keys", tablefmt='grid', showindex=False, floatfmt=".2f" )) - if not improvement.empty: + if improvement_records: print("\n🟢 Improvement:") - display_records = [] - for _, row in improvement.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': 
record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + improvement_display = [r for r in display_records + if (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑') + and not (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓')] + print(tabulate( + improvement_display, + headers="keys", + tablefmt='grid', + showindex=False, + floatfmt=".2f" + )) + if mixed_records: + print("\n🟡 Mixed Changes (one metric improves, another regression):") + mixed_display = [r for r in display_records + if ((r.get('Profile Change', '') == '↑' and r.get('E2E Change', '') == '↓') or + (r.get('Profile Change', '') == '↓' and r.get('E2E Change', '') == '↑'))] print(tabulate( - display_records, + mixed_display, headers="keys", tablefmt='grid', showindex=False, floatfmt=".2f" )) - # Print Summary on Github Action Summary + + # Generate GitHub summary summary_output = f"## {direction} Performance Comparison Results\n" - if not regression.empty: - summary_output += f"\n### 🔴 {direction} Regression\n" - display_records = [] - for _, row in regression.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + if regression_records: + summary_output += "\n### 🔴 Regression\n" summary_output += tabulate( - display_records, + [r for r in display_records + if (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓') + and not (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑')], headers="keys", tablefmt='github', showindex=False, floatfmt=".2f" ) + "\n" - if not improvement.empty: - summary_output += f"\n### 🟢 {direction} Improvement\n" - display_records = [] - for _, row in improvement.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + if improvement_records: + summary_output += "\n### 🟢 Improvement\n" + summary_output += tabulate( + [r for r in display_records + if (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑') + and not (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓')], + headers="keys", + tablefmt='github', + showindex=False, + floatfmt=".2f" + ) + "\n" + if mixed_records: + summary_output += "\n### 🟡 Mixed Changes\n" + summary_output += "One metric improves while another regression\n" summary_output += tabulate( - display_records, + [r for r in display_records + if ((r.get('Profile Change', '') == '↑' and r.get('E2E Change', '') == '↓') or + (r.get('Profile Change', '') == '↓' and r.get('E2E Change', '') == '↑'))], headers="keys", tablefmt='github', showindex=False, @@ -160,50 +200,100 @@ def display_comparison(results, threshold, xpu_file): write_to_github_summary(summary_output) -def compare_op_time_values(xpu_file, baseline_file, threshold=0.05, output_file=None): - df_xpu = pd.read_csv(xpu_file, sep=';') - df_baseline = 
pd.read_csv(baseline_file, sep=';') +def compare_time_values(xpu_file, baseline_file, threshold=0.05, profile_only=False, e2e_only=False): + def prepare_df(df): + df.columns = df.columns.str.strip() + if 'time(us)' not in df.columns: + df['time(us)'] = float('nan') + if 'E2E total time(us)' not in df.columns: + df['E2E total time(us)'] = float('nan') + return df + + df_xpu = prepare_df(pd.read_csv(xpu_file, sep=';')) + df_baseline = prepare_df(pd.read_csv(baseline_file, sep=';')) + + for col in ['time(us)', 'E2E total time(us)']: + df_xpu[col] = pd.to_numeric(df_xpu[col], errors='coerce') + df_baseline[col] = pd.to_numeric(df_baseline[col], errors='coerce') records_xpu = [preprocess_row(row) for _, row in df_xpu.iterrows()] records_baseline = [preprocess_row(row) for _, row in df_baseline.iterrows()] - dict_xpu = { - tuple((k, str(v)) for k, v in record.items() if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']): - record['time(us)'] - for record in records_xpu - } - dict_baseline = { - tuple((k, str(v)) for k, v in record.items() if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']): - record['time(us)'] - for record in records_baseline + data_dict = { + 'xpu': {'profile': {}, 'e2e': {}}, + 'baseline': {'profile': {}, 'e2e': {}} } - common_keys = set(dict_xpu.keys()) & set(dict_baseline.keys()) + + for record, source in [(records_xpu, 'xpu'), (records_baseline, 'baseline')]: + for r in record: + key = tuple((k, str(v)) for k, v in r.items() + if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']) + + for time_type in ['profile', 'e2e']: + col = 'time(us)' if time_type == 'profile' else 'E2E total time(us)' + if col in r: + try: + time_val = float(r[col]) + if not pd.isna(time_val): + data_dict[source][time_type][key] = time_val + except (ValueError, TypeError): + continue + results = [] + compare_both = not profile_only and not e2e_only + all_keys = set().union(*[set(data_dict[s][t].keys()) + for s in data_dict for t in data_dict[s]]) - for key in common_keys: - time_xpu = dict_xpu[key] - time_baseline = dict_baseline[key] - - # Skip comparison if time_xpu or time_baseline is 0 - if time_xpu == 0 or time_baseline == 0: - continue - - diff = (time_baseline - time_xpu) / time_xpu - # Compare Time, Lower is better - if abs(diff) > threshold: - record = dict(key) - print(record) - record.update({ - 'time_xpu_file': time_xpu, - 'time_baseline_file': time_baseline, - 'difference': f"{diff:.2%}", - 'change': "↑" if diff > 0 else "↓" - }) - results.append(record) + for key in all_keys: + record = dict(key) + should_include = False - result_df = pd.DataFrame(results) if results else pd.DataFrame() - display_comparison(result_df, threshold, xpu_file) + if not e2e_only and key in data_dict['xpu']['profile'] and key in data_dict['baseline']['profile']: + xpu_time = data_dict['xpu']['profile'][key] + base_time = data_dict['baseline']['profile'][key] + if xpu_time != 0 and base_time != 0: + try: + diff = (base_time - xpu_time) / xpu_time + record.update({ + 'profile_time_xpu': xpu_time, + 'profile_time_base': base_time, + 'profile_diff': f"{diff:.2%}", + 'profile_change': "↑" if diff > threshold else "↓" if diff < -threshold else "" + }) + if abs(diff) > threshold: + should_include = True + except (TypeError, ValueError): + pass + + if not profile_only and key in data_dict['xpu']['e2e'] and key in data_dict['baseline']['e2e']: + xpu_time = data_dict['xpu']['e2e'][key] + base_time = data_dict['baseline']['e2e'][key] + + if xpu_time != 0 and 
base_time != 0: + try: + diff = (base_time - xpu_time) / xpu_time + record.update({ + 'e2e_time_xpu': xpu_time, + 'e2e_time_base': base_time, + 'e2e_diff': f"{diff:.2%}", + 'e2e_change': "↑" if diff > threshold else "↓" if diff < -threshold else "" + }) + if abs(diff) > threshold: + should_include = True + except (TypeError, ValueError): + pass + + if compare_both: + if should_include: + results.append(record) + else: + if ((profile_only and 'profile_change' in record and record['profile_change']) or + (e2e_only and 'e2e_change' in record and record['e2e_change'])): + results.append(record) + + result_df = pd.DataFrame(results) if results else pd.DataFrame() + display_comparison(result_df, threshold, xpu_file, compare_both) def main(): parser = argparse.ArgumentParser(description='Compare time values between two CSV files') @@ -211,19 +301,40 @@ def main(): parser.add_argument('-b', '--baseline_file', required=True, help="XPU OP baseline result csv files dir") parser.add_argument('-t', '--threshold', type=float, default=0.10, help='Threshold for time difference (default: 0.10 for 10%)') + parser.add_argument('--profile-only', action='store_true', + help='Only compare profile time (time(us))') + parser.add_argument('--e2e-only', action='store_true', + help='Only compare E2E time (E2E total time(us))') args = parser.parse_args() - print(f" Compared file: {args.xpu_file} 和 {args.baseline_file}") + if args.profile_only and args.e2e_only: + raise ValueError("Cannot specify both --profile-only and --e2e-only") + + print(f" Compared file: {args.xpu_file} and {args.baseline_file}") print(f" Threshold: {args.threshold:.0%}") + if args.profile_only: + print(" Comparing only profile time (time(us))") + elif args.e2e_only: + print(" Comparing only E2E time (E2E total time(us))") + else: + print(" Comparing both profile time and E2E time in same table") + write_to_github_summary("## Performance Comparison Set") write_to_github_summary(f"- Threshold: {args.threshold:.0%}") + if args.profile_only: + write_to_github_summary("- Comparing only profile time (time(us))") + elif args.e2e_only: + write_to_github_summary("- Comparing only E2E time (E2E total time(us))") + else: + write_to_github_summary("- Comparing both profile time and E2E time in same table") - compare_op_time_values( + compare_time_values( xpu_file=args.xpu_file, baseline_file=args.baseline_file, threshold=args.threshold, + profile_only=args.profile_only, + e2e_only=args.e2e_only ) - if __name__ == "__main__": main() diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 9760e6d960..35a5a62536 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -152,18 +152,25 @@ jobs: run: | REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ --json body -q .body |grep "Inductor-XPU-OP-Benchmark-Data" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*" rm -rf ${{ github.workspace }}/reference mkdir -p ${{ github.workspace }}/reference - mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference - mkdir -p ${{ github.workspace }}/baseline - if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then - cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline - cp ${{ github.workspace 
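The sign convention in compare_time_values is easy to misread: diff = (baseline - current) / current, so a positive diff means the current run is faster (improvement) and a negative diff means a regression. A small worked example with an illustrative 10% threshold:

threshold = 0.10

def classify(base_time, cur_time, threshold):
    # diff = (baseline - current) / current: positive when the current run is faster
    diff = (base_time - cur_time) / cur_time
    return "↑" if diff > threshold else "↓" if diff < -threshold else ""

print(classify(12.0, 10.0, threshold))  # '↑'  (+20.0%: improvement)
print(classify(10.0, 12.0, threshold))  # '↓'  (-16.7%: regression)
print(classify(10.0, 10.5, threshold))  # ''   (-4.8%: within threshold)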
diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml
index 9760e6d960..35a5a62536 100644
--- a/.github/workflows/_linux_op_benchmark.yml
+++ b/.github/workflows/_linux_op_benchmark.yml
@@ -152,18 +152,25 @@ jobs:
         run: |
           REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
             --json body -q .body |grep "Inductor-XPU-OP-Benchmark-Data" |sed 's/.*: *//')"
-          gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*"
           rm -rf ${{ github.workspace }}/reference
           mkdir -p ${{ github.workspace }}/reference
-          mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference
-          mkdir -p ${{ github.workspace }}/baseline
-          if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then
-            cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline
-            cp ${{ github.workspace }}/reference/new_baseline/baseline_backward_op_summary.csv ${{ github.workspace }}/baseline
+          mkdir -p ${{ github.workspace }}/baseline
+          if [[ -n "${REFERENCE_RUN_ID}" ]]; then
+            echo "Using reference run ID: ${REFERENCE_RUN_ID}"
+            gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*"
+            mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference
+
+            if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then
+              cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline
+              cp ${{ github.workspace }}/reference/new_baseline/baseline_backward_op_summary.csv ${{ github.workspace }}/baseline
+            else
+              cp ${{ github.workspace }}/reference/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
+              cp ${{ github.workspace }}/reference/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
+            fi
           else
-            cp ${{ github.workspace }}/reference/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
-            cp ${{ github.workspace }}/reference/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
+            echo "No reference run ID found, using local op_benchmark as baseline"
+            cp ${{ github.workspace }}/op_benchmark/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
+            cp ${{ github.workspace }}/op_benchmark/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
           fi
       - name: Check the OP Regression
         run: |
@@ -175,7 +182,7 @@ jobs:
       - name: Update OP Baseline
         run: |
          pip install tabulate pandas
-          mkdir ${{ github.workspace }}/new_baseline
+          mkdir -p ${{ github.workspace }}/new_baseline
           cp ${{ github.workspace }}/baseline/baseline*.csv ${{ github.workspace }}/new_baseline
           # Update forward op
           python ${{ github.workspace }}/.github/scripts/op_calculate_best_perf.py --xpu ${{ github.workspace }}/op_benchmark/forward_op_summary.csv --baseline ${{ github.workspace }}/new_baseline/baseline_forward_op_summary.csv -r
diff --git a/test/microbench/adaptive_avg_pool2d.py b/test/microbench/adaptive_avg_pool2d.py
index a334a9252b..656c74ff73 100644
--- a/test/microbench/adaptive_avg_pool2d.py
+++ b/test/microbench/adaptive_avg_pool2d.py
@@ -1,18 +1,17 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (8, 512, 32, 32, (7, 7)),
     (8, 256, 56, 56, (14, 14)),
 ]
-num_iter = 20
+backward = True
 
 
-def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
+def Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, output_size = (
         shape[0],
         shape[1],
@@ -48,13 +47,38 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
+                Adaptive_AVGPool2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -69,20 +93,44 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
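The run_e2e helpers above use a conditional expression purely for its side effect, which is easy to misread. A small hypothetical helper (not part of this PR) expresses the same synchronization logic more plainly and applies to every microbench file below:

import torch

def sync(device: str) -> None:
    # Block until all queued kernels on the device have finished, so the
    # wall-clock window in run_e2e measures device work rather than just
    # kernel launch overhead.
    if device == "xpu":
        torch.xpu.synchronize()
    elif device == "cuda":
        torch.cuda.synchronize()
    # CPU tensors execute eagerly; nothing to synchronize.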
diff --git a/test/microbench/avg_pool2d.py b/test/microbench/avg_pool2d.py
index 658bf8a56f..cbb1ab2e2c 100644
--- a/test/microbench/avg_pool2d.py
+++ b/test/microbench/avg_pool2d.py
@@ -1,20 +1,19 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (16, 24, 112, 112, (3), (2)),
     (16, 1984, 7, 7, (3, 2), (2, 1)),
     (64, 1024, 112, 112, (6), (4)),
     (16, 2048, 224, 224, (3), (2)),
 ]
-num_iter = 20
+backward = True
 
 
-def AVGPool2d(shape, dtype, channels_last, backward):
+def AVGPool2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, kernel_size, stride = (
         shape[0],
         shape[1],
@@ -55,13 +54,38 @@ def AVGPool2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            AVGPool2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        AVGPool2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                AVGPool2d(shape, dtype, channels_last, backward)
+                AVGPool2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -78,20 +102,45 @@ def AVGPool2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        AVGPool2d(shape, dtype, channels_last, backward)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    AVGPool2d(shape, dtype, channels_last, backward)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/avg_pool3d.py b/test/microbench/avg_pool3d.py
index 6d9057f89d..77cf810341 100644
--- a/test/microbench/avg_pool3d.py
+++ b/test/microbench/avg_pool3d.py
@@ -1,19 +1,18 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (16, 24, 28, 19, 19, (3), (2)),
     (16, 1984, 7, 7, 7, (3, 2, 2), (2, 1, 2)),
     (64, 1024, 14, 14, 14, (6), (4)),
 ]
-num_iter = 20
+backward = True
 
 
-def AVGPool3d(shape, dtype, channels_last, backward):
+def AVGPool3d(shape, dtype, channels_last, backward, device):
     N, C, D, H, W, kernel_size, stride = (
         shape[0],
         shape[1],
@@ -55,13 +54,38 @@ def AVGPool3d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            AVGPool3d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        AVGPool3d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                AVGPool3d(shape, dtype, channels_last, backward)
+                AVGPool3d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -78,20 +102,45 @@ def AVGPool3d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        AVGPool3d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    AVGPool3d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_1d.py b/test/microbench/batch_norm_1d.py
index b55d96de72..0efc3da933 100644
--- a/test/microbench/batch_norm_1d.py
+++ b/test/microbench/batch_norm_1d.py
@@ -1,56 +1,92 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [((64, 8), (8)), ((4, 128, 15000), (128)), ((4, 256, 512), (256))]
+backward = True
 
-for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-    for shape in shape_list:
-        backward = True
-        # input
-        input = torch.randn(shape[0], device=device, dtype=dtype)
-
-        if backward:
-            input.requires_grad_(True)
-
-        # warm up
-        m = torch.nn.BatchNorm1d(shape[1], device=device)
-        output = m(input)
-
-        print(
-            "shape:",
-            shape[0],
-            "; datatype:",
-            dtype,
-            "; num_features:",
-            shape[1],
-            "; backward:",
-            backward,
-        )
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                m = torch.nn.BatchNorm1d(shape[1], device=device)
-                output = m(input)
-                if backward:
-                    gy = torch.empty_like(output)
-                    output.backward(gy)
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            m = torch.nn.BatchNorm1d(shape[1], device=device)
-            output = m(input)
-            if backward:
-                gy = torch.empty_like(output)
-                output.backward(gy)
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def BTN1d(m, input, backward, device):
+    output = m(input)
+    if backward:
+        gy = torch.empty_like(output)
+        output.backward(gy)
+
+
+def run_profile(m, input, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN1d(m, input, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(m, input, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN1d(m, input, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], device=args.device, dtype=dtype)
+            if backward:
+                input.requires_grad_(True)
+            m = torch.nn.BatchNorm1d(shape[1], device=args.device)
+            # warm up
+            BTN1d(m, input, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                shape[0],
+                "; datatype:",
+                dtype,
+                "; num_features:",
+                shape[1],
+                "; backward:",
+                backward,
+            )
+
+            if not args.e2e_only:
+                run_profile(m, input, backward, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(m, input, backward, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_2d.py b/test/microbench/batch_norm_2d.py
index 781e00881d..e8f83ad379 100644
--- a/test/microbench/batch_norm_2d.py
+++ b/test/microbench/batch_norm_2d.py
@@ -1,10 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [
     (256, 256, 56, 56, 256),
     (256, 2048, 7, 7, 2048),
@@ -13,9 +12,10 @@
     (4, 8, 640, 1024, 8),
     (4, 48, 20, 32, 48),
 ]
+backward = True
 
 
-def BTN2d(shape, dtype, channels_last, backward):
+def BTN2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, num_features = shape[0], shape[1], shape[2], shape[3], shape[4]
 
     if channels_last:
@@ -39,13 +39,38 @@ def BTN2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for shape in shape_list:
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                BTN2d(shape, dtype, channels_last, backward)
+                BTN2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -60,20 +85,45 @@ def BTN2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        BTN2d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    BTN2d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_3d.py b/test/microbench/batch_norm_3d.py
index 26b3a0a981..7a02f7cf2f 100644
--- a/test/microbench/batch_norm_3d.py
+++ b/test/microbench/batch_norm_3d.py
@@ -1,14 +1,14 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [(2, 5, 6, 3, 5, 5), (2, 8, 64, 64, 64, 8), (16, 16, 128, 128, 256, 16)]
+backward = True
 
 
-def BTN3d(shape, dtype, channels_last, backward):
+def BTN3d(shape, dtype, channels_last, backward, device):
     N, C, D, H, W, num_features = (
         shape[0],
         shape[1],
@@ -39,13 +39,38 @@ def BTN3d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for shape in shape_list:
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN3d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN3d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                BTN3d(shape, dtype, channels_last, backward)
+                BTN3d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -60,20 +85,44 @@ def BTN3d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        BTN3d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    BTN3d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/col2im.py b/test/microbench/col2im.py
index 88a0593f2f..29dccdb9db 100644
--- a/test/microbench/col2im.py
+++ b/test/microbench/col2im.py
@@ -1,10 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [
     ((1, 147, 1359556), (1200, 1200)),
     ((1, 147, 36100), (224, 224)),
@@ -13,56 +12,104 @@
 ]
 kernel_size = (7, 7)
 dilation = (6, 6)
-
 backward = True
 
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape[0], dtype=dtype, device=device, requires_grad=True)
-        if backward:
-            input.requires_grad_(True)
-        output_size = shape[1]
-
-        # warm up
-        output = torch.nn.functional.fold(
-            input, output_size, kernel_size, dilation, 1, 1
-        )
-        if backward:
-            torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
-
-        # go
-        print(
-            "shape:",
-            shape[0],
-            "; datatype:",
-            dtype,
-            "; output_size:",
-            shape[1],
-            "; backward:",
-            backward,
-        )
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                output = torch.nn.functional.fold(
-                    input, output_size, kernel_size, dilation, 1, 1
-                )
-                if backward:
-                    torch.autograd.grad(
-                        output, input, grad_outputs=torch.ones_like(output)
-                    )
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            output = torch.nn.functional.fold(
-                input, output_size, kernel_size, dilation, 1, 1
-            )
-            if backward:
-                torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def Col2im(input, output_size, kernel_size, dilation, backward, device):
+    output = torch.nn.functional.fold(input, output_size, kernel_size, dilation, 1, 1)
+    if backward:
+        torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
+
+
+def run_profile(input, output_size, kernel_size, dilation, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Col2im(input, output_size, kernel_size, dilation, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, output_size, kernel_size, dilation, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Col2im(input, output_size, kernel_size, dilation, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(
+                shape[0], dtype=dtype, device=args.device, requires_grad=True
+            )
+            if backward:
+                input.requires_grad_(True)
+            output_size = shape[1]
+            # warm up
+            Col2im(input, output_size, kernel_size, dilation, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                shape[0],
+                "; datatype:",
+                dtype,
+                "; output_size:",
+                shape[1],
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    input,
+                    output_size,
+                    kernel_size,
+                    dilation,
+                    backward,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    input,
+                    output_size,
+                    kernel_size,
+                    dilation,
+                    backward,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
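All of the converted microbench scripts share the same CLI defined by parse_args(). A hedged usage sketch follows; this variant accepts an explicit argv list so it can be exercised without a GPU, whereas the scripts themselves read sys.argv:

import argparse

def parse_args(argv=None):
    # Hypothetical testable twin of the parse_args() in the scripts above.
    parser = argparse.ArgumentParser(description="OP Benchmark")
    parser.add_argument("--device", type=str, default="xpu")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--profile-only", action="store_true")
    group.add_argument("--e2e-only", action="store_true")
    parser.add_argument("--num-iter", type=int, default=20)
    return parser.parse_args(argv)

args = parse_args(["--device", "cuda", "--e2e-only", "--num-iter", "50"])
print(args.device, args.e2e_only, args.num_iter)  # cuda True 50
# parse_args(["--profile-only", "--e2e-only"]) would exit with an error,
# since the two flags are declared mutually exclusive.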
diff --git a/test/microbench/distance.cdist.py b/test/microbench/distance.cdist.py
index c7047b86e2..1d638bbb57 100644
--- a/test/microbench/distance.cdist.py
+++ b/test/microbench/distance.cdist.py
@@ -1,11 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-backward = True
-num_iter = 20
 shape_list = [
     ((8, 16), (2, 16)),
     ((10, 8192), (10, 8192)),
@@ -13,60 +11,113 @@
     ((4, 512, 512), (4, 513, 512)),
     ((1, 512, 8192), (1, 1024, 8192)),
 ]
+backward = True
+
+
+def Cdist(input1, input2, backward, p, compute_mode, device):
+    output = torch.cdist(input1, input2, p, compute_mode)
+    if backward:
+        gy = torch.empty_like(output)
+        output.backward(gy)
+
+
+def run_profile(input1, input2, backward, p, compute_mode, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Cdist(input1, input2, backward, p, compute_mode, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input1, input2, backward, p, compute_mode, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Cdist(input1, input2, backward, p, compute_mode, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
 
-for shape in shape_list:
-    for p in [0, 1, 2]:
-        for compute_mode in [
-            "use_mm_for_euclid_dist_if_necessary",
-            "use_mm_for_euclid_dist",
-            "donot_use_mm_for_euclid_dist",
-        ]:
-            for dtype in [torch.float32]:
-                input1 = torch.rand(shape[0], device=device, dtype=dtype)
-                input2 = torch.rand(shape[1], device=device, dtype=dtype)
-                if backward:
-                    input1.requires_grad_(True)
-                    input2.requires_grad_(True)
-
-                # warm up
-                output = torch.cdist(input1, input2, p, compute_mode)
-                if backward:
-                    gy = torch.empty_like(output)
-                    output.backward(gy)
-
-                # go
-                print(
-                    "shape:",
-                    (shape),
-                    "; datatype:",
-                    dtype,
-                    "; P:",
-                    p,
-                    "; mode:",
-                    compute_mode,
-                    "; backward:",
-                    backward,
-                )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        output = torch.cdist(input1, input2, p, compute_mode)
-                        if backward:
-                            gy = torch.empty_like(output)
-                            output.backward(gy)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    output = torch.cdist(input1, input2, p, compute_mode)
-                    if backward:
-                        gy = torch.empty_like(output)
-                        output.backward(gy)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def benchmark(args):
+    for shape in shape_list:
+        for p in [0, 1, 2]:
+            for compute_mode in [
+                "use_mm_for_euclid_dist_if_necessary",
+                "use_mm_for_euclid_dist",
+                "donot_use_mm_for_euclid_dist",
+            ]:
+                for dtype in [torch.float32]:
+                    input1 = torch.rand(shape[0], device=args.device, dtype=dtype)
+                    input2 = torch.rand(shape[1], device=args.device, dtype=dtype)
+                    if backward:
+                        input1.requires_grad_(True)
+                        input2.requires_grad_(True)
+                    # warm up
+                    Cdist(input1, input2, backward, p, compute_mode, args.device)
+
+                    # go
+                    print(
+                        "shape:",
+                        (shape),
+                        "; datatype:",
+                        dtype,
+                        "; P:",
+                        p,
+                        "; mode:",
+                        compute_mode,
+                        "; backward:",
+                        backward,
+                    )
+
+                    if not args.e2e_only:
+                        run_profile(
+                            input1,
+                            input2,
+                            backward,
+                            p,
+                            compute_mode,
+                            args.device,
+                            args.num_iter,
+                        )
+
+                    if not args.profile_only:
+                        run_e2e(
+                            input1,
+                            input2,
+                            backward,
+                            p,
+                            compute_mode,
+                            args.device,
+                            args.num_iter,
+                        )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distance.pdist.py b/test/microbench/distance.pdist.py
index b0f59dd6ef..7c737347db 100644
--- a/test/microbench/distance.pdist.py
+++ b/test/microbench/distance.pdist.py
@@ -1,46 +1,83 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 forward_shape_list = [(2048, 256), (2048, 8192), (16, 8192 * 4)]
 backward_shape_list = [(256, 256), (256, 8192), (16, 8192 * 4)]
 
-for backward in [False, True]:
-    shape_list = backward_shape_list if backward else forward_shape_list
-    for shape in shape_list:
-        for dtype in [torch.float32]:
-            input = torch.rand(shape, device=device, dtype=dtype)
-            if backward:
-                input.requires_grad_(True)
-
-            # warm up
-            b = torch.nn.functional.pdist(input, 2)
-
-            # go
-            print("shape:", shape, "; datatype:", dtype, "; backward:", backward)
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(20):
-                    b = torch.nn.functional.pdist(input, 2)
-                    if backward:
-                        gy = torch.empty_like(b)
-                        b.backward(gy)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                b = torch.nn.functional.pdist(input, 2)
-                if backward:
-                    gy = torch.empty_like(b)
-                    b.backward(gy)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def Pdist(input, backward, device):
+    b = torch.nn.functional.pdist(input, 2)
+    if backward:
+        gy = torch.empty_like(b)
+        b.backward(gy)
+
+
+def run_profile(input, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Pdist(input, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Pdist(input, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for backward in [False, True]:
+        shape_list = backward_shape_list if backward else forward_shape_list
+        for shape in shape_list:
+            for dtype in [torch.float32]:
+                input = torch.rand(shape, device=args.device, dtype=dtype)
+                if backward:
+                    input.requires_grad_(True)
+                # warm up
+                Pdist(input, backward, args.device)
+
+                # go
+                print("shape:", shape, "; datatype:", dtype, "; backward:", backward)
+                if not args.e2e_only:
+                    run_profile(input, backward, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, backward, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.bernoulli.py b/test/microbench/distribution.bernoulli.py
index 9f36400b32..96d19233a6 100644
--- a/test/microbench/distribution.bernoulli.py
+++ b/test/microbench/distribution.bernoulli.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -5,18 +6,45 @@
 
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
 
-if __name__ == "__main__":
+
+def Bernoulli(input, p, device):
+    input.bernoulli_(p)
+
+
+def run_profile(input, p, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Bernoulli(input, p, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, p, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Bernoulli(input, p, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for p in [0.5, torch.tensor(0.5)]:
-                input = torch.zeros(
-                    shape, dtype=torch.bfloat16, device=torch.device("xpu")
-                )
-
+                input = torch.zeros(shape, dtype=dtype, device=args.device)
                 # warm up
-                input.bernoulli_(0.5)
+                Bernoulli(input, p, args.device)
 
                 # go
                 print(
@@ -29,20 +57,30 @@
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        input.bernoulli_(p)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    input.bernoulli_(p)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(input, p, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, p, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.cauchy.py b/test/microbench/distribution.cauchy.py
index 259ac8ed88..ddea79890e 100644
--- a/test/microbench/distribution.cauchy.py
+++ b/test/microbench/distribution.cauchy.py
@@ -1,34 +1,76 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
-
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape, dtype=dtype, device=device)
-        # warm up
-        input.cauchy_()
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                input.cauchy_()
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            input.cauchy_()
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def Cauchy(input, device):
+    input.cauchy_()
+
+
+def run_profile(input, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Cauchy(input, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Cauchy(input, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape, dtype=dtype, device=args.device)
+            # warm up
+            Cauchy(input, args.device)
+
+            # go
+            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
+            if not args.e2e_only:
+                run_profile(input, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(input, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.exponential.py b/test/microbench/distribution.exponential.py
index e3616e5114..c3d30cfca9 100644
--- a/test/microbench/distribution.exponential.py
+++ b/test/microbench/distribution.exponential.py
@@ -1,34 +1,76 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
-
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape, dtype=dtype, device=device)
-        # warm up
-        input.exponential_(0.5)
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:",
backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - input.exponential_(0.5) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Exponential(input, device): + input.exponential_(0.5) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - input.exponential_(0.5) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Exponential(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Exponential(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Exponential(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.geometric.py b/test/microbench/distribution.geometric.py index 095c7d1bae..8c8b9bfabd 100644 --- a/test/microbench/distribution.geometric.py +++ b/test/microbench/distribution.geometric.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,15 +6,44 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Geometric(input, device): + input.geometric_(0.5) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Geometric(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Geometric(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + 
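# average wall-clock seconds per iteration +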
print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=torch.bfloat16, device=args.device) # warm up - input.geometric_(0.5) + Geometric(input, args.device) # go print( @@ -26,20 +56,30 @@ "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.geometric_(0.5) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.geometric_(0.5) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.log_normal.py b/test/microbench/distribution.log_normal.py index 2b081e7cae..ca5c4bdac3 100644 --- a/test/microbench/distribution.log_normal.py +++ b/test/microbench/distribution.log_normal.py @@ -1,34 +1,76 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - # warm up - input.log_normal_(128, 128) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - input.log_normal_(128, 128) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Log_normal(input, device): + input.log_normal_(128, 128) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - input.log_normal_(128, 128) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Log_normal(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Log_normal(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else 
torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Log_normal(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.multinomial.py b/test/microbench/distribution.multinomial.py index 9ff254dcbc..b0601bb94c 100644 --- a/test/microbench/distribution.multinomial.py +++ b/test/microbench/distribution.multinomial.py @@ -1,51 +1,93 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for replacement in [False, True]: - for num_samples in [2, 128]: - input = torch.randn(shape, dtype=dtype, device=device).abs() - # warm up - input.multinomial(num_samples, replacement) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; replacement:", - replacement, - "; num_samples:", - num_samples, - "; backward:", - backward, - ) - with profile( - activities=[ - ProfilerActivity.CPU, - ProfilerActivity.XPU, - ], - record_shapes=True, - ) as prof: - for _ in range(num_iter): - input.multinomial(num_samples, replacement) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.multinomial(num_samples, replacement) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Multinomial(input, replacement, num_samples, device): + input.multinomial(num_samples, replacement) + + +def run_profile(input, replacement, num_samples, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Multinomial(input, replacement, num_samples, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, replacement, num_samples, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Multinomial(input, replacement, num_samples, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + 
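# divide total elapsed time by the iteration count +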
print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for replacement in [False, True]: + for num_samples in [2, 128]: + input = torch.randn(shape, dtype=dtype, device=args.device).abs() + # warm up + Multinomial(input, replacement, num_samples, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; replacement:", + replacement, + "; num_samples:", + num_samples, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, replacement, num_samples, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, replacement, num_samples, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.normal.py b/test/microbench/distribution.normal.py index f8067ebe8d..54112ff0eb 100644 --- a/test/microbench/distribution.normal.py +++ b/test/microbench/distribution.normal.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Normal(input, device): + input.normal_() + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Normal(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Normal(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.normal_() + Normal(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.normal_() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.normal_() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, 
args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.random.py b/test/microbench/distribution.random.py index b6a7d21831..df861a82de 100644 --- a/test/microbench/distribution.random.py +++ b/test/microbench/distribution.random.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Random(input, device): + input.random_(-(2**8), 2**8) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Random(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Random(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.random_(-(2**8), 2**8) + Random(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.random_(-(2**8), 2**8) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.random_(-(2**8), 2**8) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git 
a/test/microbench/distribution.uniform.py b/test/microbench/distribution.uniform.py index 14ad76f051..25390fb709 100644 --- a/test/microbench/distribution.uniform.py +++ b/test/microbench/distribution.uniform.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Uniform(input, device): + input.uniform_() + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Uniform(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Uniform(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.uniform_() + Uniform(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.uniform_() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.uniform_() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/dropout.py b/test/microbench/dropout.py index c79dcd1a5e..eb9f06cdf1 100644 --- a/test/microbench/dropout.py +++ b/test/microbench/dropout.py @@ -1,27 +1,58 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192), (16, 1024)] -num_iter = 20 +backward = True -if __name__ == "__main__": - backward = True + +def Dropout(input, grad_dpcpp, dropout, backward, device): + output = dropout(input) + if backward: + output.backward(grad_dpcpp) + + +def run_profile(input, grad_dpcpp, dropout, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else 
ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Dropout(input, grad_dpcpp, dropout, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grad_dpcpp, dropout, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Dropout(input, grad_dpcpp, dropout, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: H, W = (shape[0], shape[1]) - input = torch.randn((H, W)).to(dtype=dtype, device="xpu") + input = torch.randn((H, W)).to(dtype=dtype, device=args.device) dropout = torch.nn.Dropout(p=0.5) - dropout.to(device="xpu", dtype=dtype) - grad_dpcpp = torch.randn((H, W)).to(device="xpu", dtype=dtype) - input.requires_grad_(True) - - # warm up - output = dropout(input) + dropout.to(device=args.device, dtype=dtype) if backward: - output.backward(grad_dpcpp) + grad_dpcpp = torch.randn((H, W)).to(device=args.device, dtype=dtype) + input.requires_grad_(True) + # warm up + Dropout(input, grad_dpcpp, dropout, backward, args.device) # go print( @@ -34,24 +65,34 @@ "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = dropout(input) - if backward: - output.backward(grad_dpcpp) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = dropout(input) - if backward: - output.backward(grad_dpcpp) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + input, grad_dpcpp, dropout, backward, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, grad_dpcpp, dropout, backward, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/eltwise.add.py b/test/microbench/eltwise.add.py index e07ea315eb..6b0087e884 100644 --- a/test/microbench/eltwise.add.py +++ b/test/microbench/eltwise.add.py @@ -1,56 +1,95 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 - shape_list = [ ((8192, 8192), (8192, 8192)), # contiguous input ((100000, 10000), (100000, 10000)), # non-contiguous input ((8190, 8190), (8190, 8190)), # non-vectorized input ((8192, 8192), (0.5)), # scalar input ] +backward = False + -for shape in shape_list: - for 
dtype in [torch.bfloat16, torch.float16, torch.float32]: - a = torch.randn(shape[0], dtype=dtype, device=device) - if shape[1] == 0.5: - b = int(shape[1]) - else: - b = torch.randn(shape[1], dtype=dtype, device=device) - if shape[0] == 100000: - a = torch.as_strided(a, (8192, 8192), (20000, 2)) - b = torch.as_strided(b, (8192, 8192), (20000, 2)) - - # warm up - for i in range(10): - output = a + b - - # go - print( - "shape:", - (shape[0], shape[1]), - "; datatype:", - dtype, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = a + b - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +def Add(a, b, device): + output = a + b + + +def run_profile(a, b, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = a + b - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Add(a, b, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, b, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Add(a, b, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + a = torch.randn(shape[0], dtype=dtype, device=args.device) + if shape[1] == 0.5: + b = int(shape[1]) + else: + b = torch.randn(shape[1], dtype=dtype, device=args.device) + if shape[0] == 100000: + a = torch.as_strided(a, (8192, 8192), (20000, 2)) + b = torch.as_strided(b, (8192, 8192), (20000, 2)) + # warm up + Add(a, b, args.device) + + # go + print( + "shape:", + (shape[0], shape[1]), + "; datatype:", + dtype, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(a, b, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(a, b, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/embedding.py b/test/microbench/embedding.py index 9ee81ad974..a7723a9a72 100644 --- a/test/microbench/embedding.py +++ b/test/microbench/embedding.py @@ -1,43 +1,86 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 8)] -device = "xpu" backward = True dict_len = 2500000 vect_len = 128 num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, 
torch.float16, torch.float32]: - emb = torch.nn.Embedding(dict_len, vect_len, dtype=dtype, device=device) - input = torch.randint(0, dict_len, (1024, 8), device=device) - grad = torch.randn(1024, 8, vect_len, dtype=dtype, device=device) - - # warm up - output = emb(input) - output.backward(grad) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = emb(input) - output.backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def Embedding(input, grad, emb, device): + output = emb(input) + output.backward(grad) + + +def run_profile(input, grad, emb, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = emb(input) - output.backward(grad) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Embedding(input, grad, emb, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grad, emb, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Embedding(input, grad, emb, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + emb = torch.nn.Embedding( + dict_len, vect_len, dtype=dtype, device=args.device + ) + input = torch.randint(0, dict_len, (1024, 8), device=args.device) + grad = torch.randn(1024, 8, vect_len, dtype=dtype, device=args.device) + # warm up + Embedding(input, grad, emb, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, grad, emb, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, grad, emb, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/embedding_bag.py b/test/microbench/embedding_bag.py index 336c4f3067..f121a75fae 100644 --- a/test/microbench/embedding_bag.py +++ b/test/microbench/embedding_bag.py @@ -1,68 +1,102 @@ +import argparse import random import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +dict_len = 2500000 +vect_len = 128 +batch = 1024 backward = True -num_iter = 20 - -for dtype in [torch.bfloat16, torch.float16,
torch.float32]: - for reduce in ["max", "mean", "sum"]: - dict_len = 2500000 - vect_len = 128 - batch = 1024 - - emb = torch.nn.EmbeddingBag( - dict_len, vect_len, mode=reduce, dtype=dtype, device=device - ) - input = torch.empty([batch], dtype=torch.long, device=device) - for i in range(batch): - input[i] = random.randint(0, dict_len - 1) - - bag = torch.empty([batch], dtype=torch.long, device=device) - for i in range(batch): - bag[i] = i - - if backward: - grad = torch.randn(batch, vect_len, dtype=dtype, device=device) - - # warm up - for i in range(5): - output = emb(input, bag) - if backward: - output.backward(grad) - - # go - print( - "shape:", - (batch), - "; datatype:", - dtype, - "; reduce:", - reduce, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(20): - output = emb(input, bag) - if backward: - output.backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Embedding_bag(input, bag, grad, emb, backward, device): + output = emb(input, bag) + if backward: + output.backward(grad) + + +def run_profile(input, bag, grad, emb, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = emb(input, bag) + Embedding_bag(input, bag, grad, emb, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, bag, grad, emb, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Embedding_bag(input, bag, grad, emb, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["max", "mean", "sum"]: + input = torch.empty([batch], dtype=torch.long, device=args.device) + emb = torch.nn.EmbeddingBag( + dict_len, vect_len, mode=reduce, dtype=dtype, device=args.device + ) + for i in range(batch): + input[i] = random.randint(0, dict_len - 1) + + bag = torch.empty([batch], dtype=torch.long, device=args.device) + for i in range(batch): + bag[i] = i + if backward: - output.backward(grad) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + grad = torch.randn(batch, vect_len, dtype=dtype, device=args.device) + # warm up + Embedding_bag(input, bag, grad, emb, backward, args.device) + + # go + print( + "shape:", + (batch), + "; datatype:", + dtype, + "; reduce:", + reduce, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, bag, grad, emb, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, bag, grad, emb, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", 
action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/flip.py b/test/microbench/flip.py index 3dd13ad73c..2914731ae9 100644 --- a/test/microbench/flip.py +++ b/test/microbench/flip.py @@ -1,10 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 shape_list = [ ((64, 1024, 1024), (0, 1)), ((1024, 64, 1024), (0, 2)), @@ -13,52 +12,86 @@ ((16, 128, 512, 512), (0, 3)), ((16, 128, 512, 512), (1, 3)), ] - backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - - if backward: - input.requires_grad_(True) - - # warm up - output = torch.flip(input, shape[1]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.flip(input, shape[1]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Flip(input, shape, backward, device): + output = torch.flip(input, shape[1]) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(input, shape, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.flip(input, shape[1]) + Flip(input, shape, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, shape, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Flip(input, shape, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + # warm up + Flip(input, shape, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, shape, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, shape, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + 
default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/grid_sampler.grid_sampler_2d.py b/test/microbench/grid_sampler.grid_sampler_2d.py index 2dc3e7332d..c8ba070e5e 100644 --- a/test/microbench/grid_sampler.grid_sampler_2d.py +++ b/test/microbench/grid_sampler.grid_sampler_2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,81 +9,133 @@ (4, 32, 128, 128), (16, 128, 512, 512), ] - -device = "xpu" backward = True -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["bilinear", "nearest", "bicubic"]: - for padding_mode in ["zeros", "border", "reflection"]: - for align_corners in [True, False]: - N, C, H, W = shape - - input = torch.randn(N, C, H, W, dtype=dtype, device=device) - grid = torch.randn(N, H, W, 2, dtype=dtype, device=device) - - if backward: - input.requires_grad_(True) - grid.requires_grad_(True) - - # warm up - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; padding_mode:", - padding_mode, - "; align_corners:", - align_corners, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.functional.grid_sample( + + +def Grad_sample2d(input, grid, backward, mode, padding_mode, align_corners, device): + output = torch.nn.functional.grid_sample( + input, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners, + ) + if backward: + output.sum().backward() + + +def run_profile( + input, grid, backward, mode, padding_mode, align_corners, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Grad_sample2d( + input, grid, backward, mode, padding_mode, align_corners, device + ) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grid, backward, mode, padding_mode, align_corners, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Grad_sample2d(input, grid, backward, mode, padding_mode, align_corners, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def 
benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for mode in ["bilinear", "nearest", "bicubic"]: + for padding_mode in ["zeros", "border", "reflection"]: + for align_corners in [True, False]: + N, C, H, W = shape + input = torch.randn(N, C, H, W, dtype=dtype, device=args.device) + grid = torch.randn(N, H, W, 2, dtype=dtype, device=args.device) + + if backward: + input.requires_grad_(True) + grid.requires_grad_(True) + + # warm up + Grad_sample2d( input, grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, + backward, + mode, + padding_mode, + align_corners, + args.device, ) - if backward: - output.sum().backward() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; mode:", + mode, + "; padding_mode:", + padding_mode, + "; align_corners:", + align_corners, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/grid_sampler.grid_sampler_3d.py b/test/microbench/grid_sampler.grid_sampler_3d.py index 41a296fb90..b7841b8847 100644 --- a/test/microbench/grid_sampler.grid_sampler_3d.py +++ b/test/microbench/grid_sampler.grid_sampler_3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -7,80 +8,136 @@ (2, 5, 6, 3, 5), (8, 16, 64, 64, 64), ] - -device = "xpu" backward = True -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["bilinear", "nearest"]: - for padding_mode in ["zeros", "border", "reflection"]: - for align_corners in [True, False]: - N, C, D, H, W = shape - input = torch.randn(N, C, D, H, W, dtype=dtype, device=device) - grid = torch.randn(N, D, H, W, 3, dtype=dtype, device=device) - - if backward: - input.requires_grad_(True) - grid.requires_grad_(True) - - # warm up - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; padding_mode:", - padding_mode, - "; align_corners:", - align_corners, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - 
print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.functional.grid_sample( + + +def Grad_sample3d(input, grid, backward, mode, padding_mode, align_corners, device): + output = torch.nn.functional.grid_sample( + input, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners, + ) + if backward: + output.sum().backward() + + +def run_profile( + input, grid, backward, mode, padding_mode, align_corners, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Grad_sample3d( + input, grid, backward, mode, padding_mode, align_corners, device + ) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grid, backward, mode, padding_mode, align_corners, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Grad_sample3d(input, grid, backward, mode, padding_mode, align_corners, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for mode in ["bilinear", "nearest"]: + for padding_mode in ["zeros", "border", "reflection"]: + for align_corners in [True, False]: + N, C, D, H, W = shape + input = torch.randn( + N, C, D, H, W, dtype=dtype, device=args.device + ) + grid = torch.randn( + N, D, H, W, 3, dtype=dtype, device=args.device + ) + + if backward: + input.requires_grad_(True) + grid.requires_grad_(True) + # warm up + Grad_sample3d( input, grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, + backward, + mode, + padding_mode, + align_corners, + args.device, ) - if backward: - output.sum().backward() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; mode:", + mode, + "; padding_mode:", + padding_mode, + "; align_corners:", + align_corners, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/group_norm.py b/test/microbench/group_norm.py index ef02aaa749..74efcfd4c4 100644 --- 
a/test/microbench/group_norm.py +++ b/test/microbench/group_norm.py @@ -1,11 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [ (1, 32, 128, 32, 32), # all channel for 1 group (16, 1024, 128, 32, 32), # normal shape, big memory @@ -13,79 +11,113 @@ (32, 32, 512, 256, 256), # group_num=32, channel for per group=16,big memory (8, 32, 32, 16, 64, 64), # 3d ] +backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for channels_last in [False, True]: - for affine in [False, True]: - num_groups = shape[0] - shape_input = (shape[1], shape[2], shape[3], shape[4]) - C = shape[2] - memory_format = ( - torch.channels_last_3d - if len(shape_input) == 5 - else torch.channels_last - ) - - if channels_last: - input = ( - torch.randn(shape_input) - .to(memory_format=memory_format) - .to(device=device, dtype=dtype) - ) - else: - input = torch.randn(shape_input).to(device=device, dtype=dtype) +def Group_norm(input, m, backward, device): + output = m(input) + if backward: + grad_out = torch.randn_like(output).to(device) + (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - if backward: - input.requires_grad_(True) - m = torch.nn.GroupNorm(num_groups, C, affine=affine, dtype=dtype).to( - device - ) +def run_profile(input, m, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Group_norm(input, m, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # warm up - for i in range(5): - output = m(input) + +def run_e2e(input, m, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Group_norm(input, m, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for channels_last in [False, True]: + for affine in [False, True]: + num_groups = shape[0] + shape_input = (shape[1], shape[2], shape[3], shape[4]) + C = shape[2] + memory_format = ( + torch.channels_last_3d + if len(shape_input) == 5 + else torch.channels_last + ) + + if channels_last: + input = ( + torch.randn(shape_input) + .to(memory_format=memory_format) + .to(device=args.device, dtype=dtype) + ) + else: + input = torch.randn(shape_input).to( + device=args.device, dtype=dtype + ) if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - - # go - print( - "shape:", - (shape_input), - "; datatype:", - dtype, - "; channels_last:", - channels_last, - "; affine:", - affine, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = m(input) - - if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = 
time.time() - for i in range(num_iter): - output = m(input) - if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + m = torch.nn.GroupNorm( + num_groups, C, affine=affine, dtype=dtype + ).to(args.device) + # warm up + Group_norm(input, m, backward, args.device) + + # go + print( + "shape:", + (shape[1], shape[2], shape[3], shape[4]), + "; datatype:", + dtype, + "; channels_last:", + channels_last, + "; affine:", + affine, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, m, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, m, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/im2col.py b/test/microbench/im2col.py index 5af56d285e..e1b9a6b43c 100644 --- a/test/microbench/im2col.py +++ b/test/microbench/im2col.py @@ -1,52 +1,84 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" - shape_list = [(1, 3, 1200, 1200), (1, 3, 224, 224), (1, 3, 63, 1200), (1, 3, 1200, 63)] kernel_size = (7, 7) dilation = (6, 6) -num_iter = 20 backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device, requires_grad=backward) - - # warmup - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 - ) - if backward: - torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 - ) - if backward: - torch.autograd.grad( - output, input, grad_outputs=torch.ones_like(output) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def Im2col(input, kernel_size, backward, device): + output = torch.nn.functional.unfold( + input, kernel_size, dilation=dilation, padding=1, stride=1 + ) + if backward: + torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) + + +def run_profile(input, kernel_size, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 + Im2col(input, kernel_size, backward, device) + 
print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, kernel_size, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Im2col(input, kernel_size, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn( + shape, dtype=dtype, device=args.device, requires_grad=backward ) - if backward: - torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + # warmup + Im2col(input, kernel_size, backward, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, kernel_size, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, kernel_size, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.diag.py b/test/microbench/indexing.diag.py index 45ff879c96..6ba9ab625e 100644 --- a/test/microbench/indexing.diag.py +++ b/test/microbench/indexing.diag.py @@ -1,41 +1,80 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192), (8192, 8192)] -device = "xpu" backward = False -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - output = torch.diag(input) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = torch.diag(input) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Diag(input, device): + output = torch.diag(input) + + +def run_profile(input, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - output = torch.diag(input) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + 
Diag(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Diag(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Diag(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, cache_r, cache_w, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index.py b/test/microbench/indexing.index.py index 5827cabfb8..86305f4a55 100644 --- a/test/microbench/indexing.index.py +++ b/test/microbench/indexing.index.py @@ -1,88 +1,107 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(4, 15000)] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["with_nonzero", "without_nonzero"]: - d = torch.rand(shape, dtype=dtype, device=device) - e = torch.rand(shape, dtype=dtype, device=device) - if mode == "with_nonzero": - # warm up - for i in range(100): - f = d < e - g = e[f] +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + parser.add_argument("--num_iter", type=int, default=20, help="Number of iterations") + parser.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + parser.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - f = d < e - g = e[f] - print(prof.key_averages().table(sort_by="xpu_time_total")) + args = parser.parse_args() + return args - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): + +def benchmark(shape, dtype, mode, device, num_iter, do_profile, do_e2e): + d 
= torch.rand(shape, dtype=dtype, device=device) +    e = torch.rand(shape, dtype=dtype, device=device) +    f = d < e +    g = e[f] + +    # warm up +    if mode == "with_nonzero": +        for i in range(100): +            f = d < e +            g = e[f] +    else: +        f = torch.linspace(0, 4 - 2, steps=int(4 / 2), device=device).to(torch.long) +        for i in range(100): +            g = e[f] + +    # go +    print( +        "shape:", +        (shape), +        "; datatype:", +        dtype, +        "; mode:", +        mode, +        "; backward:", +        backward, +    ) +    if not do_e2e: +        with profile( +            activities=[ +                ProfilerActivity.CPU, +                ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +            ], +            record_shapes=True, +        ) as prof: +            for i in range(num_iter): +                if mode == "with_nonzero": f = d < e g = e[f] -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") -        else: -            f = torch.linspace(0, 4 - 2, steps=int(4 / 2), device=device).to( -                torch.long -            ) -            # warm up -            for i in range(100): +                else: g = e[f] +        print(prof.key_averages().table(sort_by=f"{device}_time_total")) + +    # E2E time +    if not do_profile: +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t1 = time.time() +        for i in range(num_iter): +            if mode == "with_nonzero": +                f = d < e +                g = e[f] +            else: +                g = e[f] +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t2 = time.time() +        e2e_time = (t2 - t1) / num_iter +        print("E2E total time:", f"{float(e2e_time):.20f}") -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; mode:", -                mode, -                "; backward:", -                backward, + +def main(): +    args = parse_args() +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for mode in ["with_nonzero", "without_nonzero"]: +                benchmark( +                    shape=shape, +                    dtype=dtype, +                    mode=mode, +                    device=args.device, +                    num_iter=args.num_iter, +                    do_profile=args.profile_only, +                    do_e2e=args.e2e_only, ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    g = e[f] -            print(prof.key_averages().table(sort_by="xpu_time_total")) -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                g = e[f] -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + +if __name__ == "__main__": +    main() diff --git a/test/microbench/indexing.index_add.py b/test/microbench/indexing.index_add.py index efc0ee9a07..7438f5543c 100644 --- a/test/microbench/indexing.index_add.py +++ b/test/microbench/indexing.index_add.py @@ -1,61 +1,120 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 step = int(1024 / 2) -cache_r = torch.randn((8192 * 8192), device=device) -cache_w = torch.randn((8192 * 8192), device=device) - -for shape in shape_list: -    for dtype in [torch.bfloat16, torch.float16, torch.float32]: -        for dim in [0, 1]: -            input = torch.zeros(shape, dtype=dtype, device=device) -            indices = torch.linspace(0, 1022, steps=step, device=device).to(torch.long) -            y_0 = torch.ones((512, 1024), dtype=dtype, device=device) -            y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - -            # warm up -            for i in range(10): -                output = input.index_add(0, indices, y_0) - -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; dim:", -                dim, -                "; backward:", - 
backward, -            ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    cache_r = cache_w * i -                    if dim == 0: -                        output = input.index_add(dim, indices, y_0) -                    else: -                        output = input.index_add(dim, indices, y_1) -            print(prof.key_averages().table(sort_by="xpu_time_total")) - -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                cache_r = cache_w * i -                if dim == 0: -                    output = input.index_add(dim, indices, y_0) -                else: -                    output = input.index_add(dim, indices, y_1) -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_add(input, indices, y_0, y_1, dim, device): +    if dim == 0: +        output = input.index_add(dim, indices, y_0) +    else: +        output = input.index_add(dim, indices, y_1) + + +def run_profile(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): +    with profile( +        activities=[ +            ProfilerActivity.CPU, +            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +        ], +        record_shapes=True, +    ) as prof: +        for i in range(num_iter): +            cache_r = cache_w * i +            Index_add(input, indices, y_0, y_1, dim, device) +    print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t1 = time.time() +    for i in range(num_iter): +        cache_r = cache_w * i +        Index_add(input, indices, y_0, y_1, dim, device) +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t2 = time.time() +    e2e_time = (t2 - t1) / num_iter +    print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for dim in [0, 1]: +                input = torch.zeros(shape, dtype=dtype, device=args.device) +                indices = torch.linspace(0, 1022, steps=step, device=args.device).to( +                    torch.long +                ) +                y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) +                y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) +                cache_r = torch.randn((8192 * 8192), device=args.device) +                cache_w = torch.randn((8192 * 8192), device=args.device) +                # warm up +                Index_add(input, indices, y_0, y_1, dim, args.device) + +                # go +                print( +                    "shape:", +                    (shape), +                    "; datatype:", +                    dtype, +                    "; dim:", +                    dim, +                    "; backward:", +                    backward, +                ) +                if not args.e2e_only: +                    run_profile( +                        input, +                        indices, +                        y_0, +                        y_1, +                        dim, +                        cache_r, +                        cache_w, +                        args.device, +                        args.num_iter, +                    ) + +                if not args.profile_only: +                    run_e2e( +                        input, +                        indices, +                        y_0, +                        y_1, +                        dim, +                        cache_r, +                        cache_w, +                        args.device, +                        args.num_iter, +                    ) + + +def parse_args(): +    parser = argparse.ArgumentParser(description="OP Benchmark") +    parser.add_argument( +        "--device", +        type=str, +        default="xpu", +        help='Device to run on (e.g., "cpu", "cuda", "xpu")', +    ) +    group = parser.add_mutually_exclusive_group() +    group.add_argument( +        "--profile-only", action="store_true", help="Only Run profile timing" +    ) +    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") +    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") +    return parser.parse_args() + + +if __name__ == "__main__": +    args = parse_args() +    benchmark(args)
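
The cache_r = cache_w * i statement inside the timed loops of these indexing benchmarks is a cache flush: between iterations it streams a large tensor (8192 * 8192 float32 elements here, 256 MB; other scripts in this patch use 1024 ** 3) through memory so index_add never re-reads data left in cache by the previous iteration. A minimal sketch of the same pattern as a reusable helper, assuming only that streaming a buffer larger than the last-level cache evicts the working set; the name make_cache_flusher is illustrative, not part of the patch:

    import torch

    def make_cache_flusher(device: str, n_elems: int = 8192 * 8192):
        # Allocate a buffer larger than the last-level cache; rewriting it
        # between timed iterations evicts the benchmark tensors from cache.
        cache_w = torch.randn(n_elems, device=device)

        def flush(i: int = 1) -> None:
            # Same read-modify-write stream as `cache_r = cache_w * i`.
            _ = cache_w * i

        return flush

diff --git a/test/microbench/indexing.index_copy.py 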
b/test/microbench/indexing.index_copy.py index b06984e321..05b17cc816 100644 --- a/test/microbench/indexing.index_copy.py +++ b/test/microbench/indexing.index_copy.py @@ -1,60 +1,119 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 1022, steps=512, device=device).to(torch.long) - y_0 = torch.ones((512, 1024), dtype=dtype, device=device) - y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - - # warm up - for i in range(10): - output = input.index_copy(0, indices, y_0) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_copy(dim, indices, y_0) - else: - output = input.index_copy(dim, indices, y_1) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_copy(dim, indices, y_0) - else: - output = input.index_copy(dim, indices, y_1) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_copy(input, indices, y_0, y_1, dim, device): + if dim == 0: + output = input.index_copy(dim, indices, y_0) + else: + output = input.index_copy(dim, indices, y_1) + + +def run_profile(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Index_copy(input, indices, y_0, y_1, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Index_copy(input, indices, y_0, y_1, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace(0, 1022, steps=512, device=args.device).to( + torch.long + ) + y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) + y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Index_copy(input, indices, y_0, y_1, dim, args.device) + + # go + print( + "shape:", + (shape), + "; 
datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + indices, + y_0, + y_1, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + y_0, + y_1, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index_fill.py b/test/microbench/indexing.index_fill.py index 3dd6309692..fbda4276c4 100644 --- a/test/microbench/indexing.index_fill.py +++ b/test/microbench/indexing.index_fill.py @@ -1,60 +1,114 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 1022, steps=512, device=device).to(torch.long) - y_0 = torch.ones((512, 1024), dtype=dtype, device=device) - y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - - # warm up - for i in range(10): - output = input.index_fill(0, indices, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_fill(dim, indices, 1) - else: - output = input.index_fill(dim, indices, 2) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_fill(dim, indices, 1) - else: - output = input.index_fill(dim, indices, 2) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_fill(input, indices, dim, device): + if dim == 0: + output = input.index_fill(dim, indices, 1) + else: + output = input.index_fill(dim, indices, 2) + + +def run_profile(input, indices, dim, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Index_fill(input, indices, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, dim, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in 
range(num_iter): + cache_r = cache_w * i + Index_fill(input, indices, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace(0, 1022, steps=512, device=args.device).to( + torch.long + ) + y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) + y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Index_fill(input, indices, dim, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + indices, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index_put.py b/test/microbench/indexing.index_put.py index b0ea89e954..8306b3b0a4 100644 --- a/test/microbench/indexing.index_put.py +++ b/test/microbench/indexing.index_put.py @@ -1,88 +1,107 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(4, 15000)] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["with_nonzero", "without_nonzero"]: - d = torch.rand(4, 15000, dtype=dtype, device=device) - e = torch.rand(4, 15000, dtype=dtype, device=device) - f = d < e - g = e[f] - if mode == "with_nonzero": - # warm up - for i in range(100): - d[f] = g +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + parser.add_argument("--num_iter", type=int, default=20, help="Number of iterations") + parser.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + parser.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + + args = parser.parse_args() + return args - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - d[f] = g - print(prof.key_averages().table(sort_by="xpu_time_total")) - # E2E time - torch.xpu.synchronize() - t1 = 
time.time() -            for i in range(num_iter): +def benchmark(shape, dtype, mode, device, num_iter, do_profile, do_e2e): +    d = torch.rand(4, 15000, dtype=dtype, device=device) +    e = torch.rand(4, 15000, dtype=dtype, device=device) +    f = d < e +    g = e[f] + +    # warm up +    if mode == "with_nonzero": +        for i in range(100): +            d[f] = g +    else: +        f = f.nonzero() +        index = [] +        for i in range(f.dim()): +            index.append(f.select(1, i)) +        for i in range(100): +            d[index] = g + +    # go +    print( +        "shape:", +        (shape), +        "; datatype:", +        dtype, +        "; mode:", +        mode, +        "; backward:", +        backward, +    ) +    if not do_e2e: +        with profile( +            activities=[ +                ProfilerActivity.CPU, +                ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +            ], +            record_shapes=True, +        ) as prof: +            for i in range(num_iter): +                if mode == "with_nonzero": d[f] = g -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") -        else: -            f = f.nonzero() -            index = [] -            for i in range(f.dim()): -                index.append(f.select(1, i)) -            # warm up -            for i in range(100): +                else: d[index] = g +        print(prof.key_averages().table(sort_by=f"{device}_time_total")) -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; mode:", -                mode, -                "; backward:", -                backward, +    # E2E time +    if not do_profile: +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t1 = time.time() +        for i in range(num_iter): +            if mode == "with_nonzero": +                d[f] = g +            else: +                d[index] = g +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t2 = time.time() +        e2e_time = (t2 - t1) / num_iter +        print("E2E total time:", f"{float(e2e_time):.20f}") + + +def main(): +    args = parse_args() +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for mode in ["with_nonzero", "without_nonzero"]: +                benchmark( +                    shape=shape, +                    dtype=dtype, +                    mode=mode, +                    device=args.device, +                    num_iter=args.num_iter, +                    do_profile=args.profile_only, +                    do_e2e=args.e2e_only, ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    d[index] = g -            print(prof.key_averages().table(sort_by="xpu_time_total")) -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                d[index] = g -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + +if __name__ == "__main__": +    main() diff --git a/test/microbench/indexing.index_select.py b/test/microbench/indexing.index_select.py index bb22752429..a953b34d98 100644 --- a/test/microbench/indexing.index_select.py +++ b/test/microbench/indexing.index_select.py @@ -1,46 +1,87 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024), (8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: -    for dtype in [torch.bfloat16, torch.float16, torch.float32]: -        main_size = shape[0] -        step = int(main_size / 2) -        input = torch.randn(shape, dtype=dtype, device=device) -        indices = torch.linspace(0, shape[0] - 2, steps=step, device=device).to( -            torch.long -        ) - -        # warm up -        for i in range(10): -            y_0 = torch.index_select(input, 0, indices) - -        # go -        print("shape:", (shape), "; 
datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - y_0 = torch.index_select(input, 0, indices) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Index_select(input, indices, device): + y_0 = torch.index_select(input, 0, indices) + + +def run_profile(input, indices, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - y_0 = torch.index_select(input, 0, indices) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Index_select(input, indices, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Index_select(input, indices, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + main_size = shape[0] + step = int(main_size / 2) + input = torch.randn(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, shape[0] - 2, steps=step, device=args.device + ).to(torch.long) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Index_select(input, indices, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, indices, cache_r, cache_w, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, indices, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.masked_fill.py b/test/microbench/indexing.masked_fill.py index 9d3eb4864a..bd62c29099 100644 --- a/test/microbench/indexing.masked_fill.py +++ b/test/microbench/indexing.masked_fill.py @@ -1,45 +1,87 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in 
[torch.bfloat16, torch.float16, torch.float32]: -        input = torch.zeros(shape, dtype=dtype, device=device) -        masks_ = torch.zeros((8192), dtype=dtype, device=device) -        indices = torch.linspace(0, 8190, steps=4096, device=device).to(torch.long) -        masks_.index_fill_(0, indices, True) -        masks = masks_.to(torch.bool) - -        # warm up -        for i in range(10): -            y_1 = input.masked_fill(mask=masks, value=1) - -        # go -        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) -        with profile( -            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -            record_shapes=True, -        ) as prof: -            for i in range(num_iter): -                cache_r = cache_w * i -                y_1 = input.masked_fill(mask=masks, value=1) -        print(prof.key_averages().table(sort_by="xpu_time_total")) - -        # E2E time -        torch.xpu.synchronize() -        t1 = time.time() + + +def Masked_fill(input, masks, device): +    y_1 = input.masked_fill(mask=masks, value=1) + + +def run_profile(input, masks, cache_r, cache_w, device, num_iter): +    with profile( +        activities=[ +            ProfilerActivity.CPU, +            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +        ], +        record_shapes=True, +    ) as prof: for i in range(num_iter): cache_r = cache_w * i -            y_1 = input.masked_fill(mask=masks, value=1) -    torch.xpu.synchronize() -    t2 = time.time() -    e2e_time = (t2 - t1) / num_iter -    print("E2E total time:", f"{float(e2e_time):.20f}") +            Masked_fill(input, masks, device) +    print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, masks, cache_r, cache_w, device, num_iter): +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t1 = time.time() +    for i in range(num_iter): +        cache_r = cache_w * i +        Masked_fill(input, masks, device) +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t2 = time.time() +    e2e_time = (t2 - t1) / num_iter +    print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            input = torch.zeros(shape, dtype=dtype, device=args.device) +            masks_ = torch.zeros((8192), dtype=dtype, device=args.device) +            indices = torch.linspace(0, 8190, steps=4096, device=args.device).to( +                torch.long +            ) +            masks_.index_fill_(0, indices, True) +            masks = masks_.to(torch.bool) +            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) +            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + +            # warm up +            Masked_fill(input, masks, args.device) + +            # go +            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) +            if not args.e2e_only: +                run_profile(input, masks, cache_r, cache_w, args.device, args.num_iter) + +            if not args.profile_only: +                run_e2e(input, masks, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): +    parser = argparse.ArgumentParser(description="OP Benchmark") +    parser.add_argument( +        "--device", +        type=str, +        default="xpu", +        help='Device to run on (e.g., "cpu", "cuda", "xpu")', +    ) +    group = parser.add_mutually_exclusive_group() +    group.add_argument( +        "--profile-only", action="store_true", help="Only Run profile timing" +    ) +    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") +    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") +    return parser.parse_args() + + +if __name__ == "__main__": +    args = parse_args() +    benchmark(args)
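
Every script in this patch builds its profiler arguments the same way: ProfilerActivity.CPU plus the accelerator activity matching --device, and the sort key f"{device}_time_total" for prof.key_averages().table(). A minimal sketch of that selection gathered in one place; profiler_config is an illustrative name, and the cpu_time_total fallback for non-accelerator devices is an assumption, not something the patch does:

    from torch.profiler import ProfilerActivity

    def profiler_config(device: str):
        # CPU activity is always recorded; add the matching accelerator.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        # Sort key understood by prof.key_averages().table(sort_by=...).
        if device in ("xpu", "cuda"):
            sort_key = f"{device}_time_total"
        else:
            sort_key = "cpu_time_total"  # assumption: CPU-only fallback
        return activities, sort_key

diff --git a/test/microbench/indexing.put.py b/test/microbench/indexing.put.py index 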
cb19375890..d24765ff71 100644 --- a/test/microbench/indexing.put.py +++ b/test/microbench/indexing.put.py @@ -1,45 +1,101 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 8190 * 8190, steps=4096 * 4096, device=device).to( - torch.long - ) - sources = torch.ones((4096, 4096), dtype=dtype, device=device) - - # warm up - for i in range(10): - input.put_(index=indices, source=sources) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - input.put_(index=indices, source=sources) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Put(input, indices, sources, device): + input.put_(index=indices, source=sources) + + +def run_profile(input, indices, sources, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - input.put_(index=indices, source=sources) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Put(input, indices, sources, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, sources, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Put(input, indices, sources, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, 8190 * 8190, steps=4096 * 4096, device=args.device + ).to(torch.long) + sources = torch.ones((4096, 4096), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Put(input, indices, sources, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, + indices, + sources, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + sources, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + 
group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.take.py b/test/microbench/indexing.take.py index 13163631a4..4d6fca2123 100644 --- a/test/microbench/indexing.take.py +++ b/test/microbench/indexing.take.py @@ -1,44 +1,86 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 8190 * 8190, steps=4096 * 4096, device=device).to( - torch.long - ) - - # warm up - for i in range(10): - output = torch.take(input, indices) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = torch.take(input, indices) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Take(input, indices, device): + output = torch.take(input, indices) + + +def run_profile(input, indices, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - output = torch.take(input, indices) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Take(input, indices, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Take(input, indices, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, 8190 * 8190, steps=4096 * 4096, device=args.device + ).to(torch.long) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Take(input, indices, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, indices, cache_r, cache_w, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, indices, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP 
Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/layer_norm.py b/test/microbench/layer_norm.py index 67a3ef502f..909185ff25 100644 --- a/test/microbench/layer_norm.py +++ b/test/microbench/layer_norm.py @@ -1,66 +1,97 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 - shape_list = [ ((1, 1024), (1024)), ((2, 4096, 320), (4096, 320)), ((512, 3136, 128), (3136, 128)), ((128, 49, 196, 1024), (49, 196, 1024)), ] +backward = True + +def Layer_norm(input, m, backward, device): + output = m(input) + if backward: + gy = torch.empty_like(output) + output.backward(gy) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - - if backward: - input.requires_grad_(True) - - # warm up - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def run_profile(input, m, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) + Layer_norm(input, m, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, m, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Layer_norm(input, m, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + m = torch.nn.LayerNorm(shape[1], device=args.device, dtype=dtype) + # warm up + Layer_norm(input, m, backward, args.device) + + # go + print( + 
"shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, m, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, m, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.binary_cross_entropy.py b/test/microbench/loss.binary_cross_entropy.py index e6e3a942ac..f30217906a 100644 --- a/test/microbench/loss.binary_cross_entropy.py +++ b/test/microbench/loss.binary_cross_entropy.py @@ -1,16 +1,12 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [(8733, 8733), (8733, 513), (513, 8733), (8192, 8192)] - -cache_r = torch.randn(1024 * 1024 * 1024, device=device) -cache_w = torch.randn(1024 * 1024 * 1024, device=device) +backward = True def _do_test(loss, input, target, dtype, device): @@ -21,50 +17,107 @@ def _do_test(loss, input, target, dtype, device): return output, grad_inputs -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - M, N = shape[0], shape[1] - input = torch.randn((M, N), requires_grad=True) - target = torch.empty((M, N)).random_(2) - for reduce in ["none", "mean", "sum"]: - loss = nn.BCELoss(reduce=reduce) - m = nn.Sigmoid() - input = m(input).to(dtype=dtype, device=device) - target = target.to(dtype=dtype, device=device) - # warm up +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w + 1 _do_test(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # go - print( - "shape:", - (M, N), - "; datatype:", - dtype, - "; reduce:", - reduce, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w + 1 - output_xpu, grad_input_xpu = _do_test( - loss, input, target, dtype, device - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w + 1 - output_xpu, grad_input_xpu = _do_test( - loss, input, target, dtype, device + +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + 1 + _do_test(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else 
torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + M, N = shape[0], shape[1] + input = torch.randn((M, N), requires_grad=True) + target = torch.empty((M, N)).random_(2) + cache_r = torch.randn(1024 * 1024 * 1024, device=args.device) + cache_w = torch.randn(1024 * 1024 * 1024, device=args.device) + for reduce in ["none", "mean", "sum"]: + loss = nn.BCELoss(reduce=reduce) + m = nn.Sigmoid() + input = m(input).to(dtype=dtype, device=args.device) + target = target.to(dtype=dtype, device=args.device) + # warm up + _do_test(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (M, N), + "; datatype:", + dtype, + "; reduce:", + reduce, + "; backward:", + backward, ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.ctc_loss.py b/test/microbench/loss.ctc_loss.py index a228663352..0026c41992 100644 --- a/test/microbench/loss.ctc_loss.py +++ b/test/microbench/loss.ctc_loss.py @@ -1,80 +1,128 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 # T,N,C,S shape_list = [(32, 32, 32, 16), (128, 128, 128, 128), (8, 8, 4, 8)] +backward = True -def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype): - log_probs_dpcpp = log_probs.to("xpu") - log_probs_dpcpp.requires_grad_(True) - targets_dpcpp = targets.to("xpu") - input_lengths_dpcpp = input_lengths.to("xpu") - target_lengths_dpcpp = target_lengths.to("xpu") - - # warm up +def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward): loss_dpcpp = torch.nn.functional.ctc_loss( - log_probs_dpcpp, targets_dpcpp, input_lengths_dpcpp, target_lengths_dpcpp - ) - loss_dpcpp.backward() - - # go - print( - "shape:", - (shape[0], shape[1], shape[2], shape[3]), - "; datatype:", - dtype, - "; backward:", - backward, + log_probs, targets, input_lengths, target_lengths ) + if backward: + loss_dpcpp.backward() + + +def run_profile( + log_probs, targets, input_lengths, target_lengths, backward, device, num_iter +): with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): - loss_dpcpp = 
torch.nn.functional.ctc_loss( - log_probs_dpcpp, - targets_dpcpp, - input_lengths_dpcpp, - target_lengths_dpcpp, - ) - loss_dpcpp.backward() - print(prof.key_averages().table(sort_by="xpu_time_total")) + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + - # E2E time - torch.xpu.synchronize() +def run_e2e( + log_probs, targets, input_lengths, target_lengths, backward, device, num_iter +): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): - loss_dpcpp = torch.nn.functional.ctc_loss( - log_probs_dpcpp, - targets_dpcpp, - input_lengths_dpcpp, - target_lengths_dpcpp, - ) - loss_dpcpp.backward() - torch.xpu.synchronize() + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -for shape in shape_list: - for dtype in [torch.float32]: - T, N, C, S = shape[0], shape[1], shape[2], shape[3] - g_cpu = torch.Generator() - g_cpu.manual_seed(15) - torch.manual_seed(15) - log_probs = ( - torch.randn(T, N, C, dtype=dtype).log_softmax(2).detach().requires_grad_() - ) - targets = torch.randint(1, N, (N, S), dtype=torch.long, generator=g_cpu) - input_lengths = torch.full((N,), T, dtype=torch.long) - target_lengths = torch.randint(1, S, (N,), dtype=torch.long, generator=g_cpu) - _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype) - g_cpu = torch.Generator() - g_cpu.manual_seed(15) - torch.manual_seed(15) +def benchmark(args): + for shape in shape_list: + for dtype in [torch.float32]: + T, N, C, S = shape[0], shape[1], shape[2], shape[3] + g_cpu = torch.Generator() + g_cpu.manual_seed(15) + torch.manual_seed(15) + log_probs = ( + torch.randn(T, N, C, dtype=dtype, device=args.device) + .log_softmax(2) + .detach() + .requires_grad_() + ) + targets = torch.randint(1, N, (N, S), dtype=torch.long, device=args.device) + input_lengths = torch.full((N,), T, dtype=torch.long, device=args.device) + target_lengths = torch.randint( + 1, S, (N,), dtype=torch.long, device=args.device + ) + + if backward: + log_probs.requires_grad_(True) + + # warm up + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + # go + print( + "shape:", + (shape[0], shape[1], shape[2], shape[3]), + "; datatype:", + dtype, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + log_probs, + targets, + input_lengths, + target_lengths, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + log_probs, + targets, + input_lengths, + target_lengths, + backward, + args.device, + args.num_iter, + ) + g_cpu = torch.Generator() + g_cpu.manual_seed(15) + torch.manual_seed(15) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == 
"__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.l1_loss.py b/test/microbench/loss.l1_loss.py index 3d02e097fb..06d228ed12 100644 --- a/test/microbench/loss.l1_loss.py +++ b/test/microbench/loss.l1_loss.py @@ -1,68 +1,123 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [ (8732, 8732), (8192, 8732), ] +backward = True -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for reduce in ["none", "mean"]: - for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - B = shape[0] - S = shape[1] - input = torch.randn((B, S), requires_grad=True).to( - dtype=dtype, device=device - ) - target = torch.randn((B, S)).to(dtype=dtype, device=device) - loss = torch.nn.L1Loss(reduction=reduce) - - # warm up - output_xpu = loss(input, target) - output_xpu.backward(torch.ones_like(output_xpu, dtype=dtype, device=device)) - - # go - print( - "shape:", - (B, S), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output_xpu = loss(input, target) - cache_r = cache_w * i - output_xpu.backward( - torch.ones_like(output_xpu, dtype=dtype, device=device) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - output_xpu = loss(input, target) - cache_r = cache_w * i - output_xpu.backward( - torch.ones_like(output_xpu, dtype=dtype, device=device) + +def L1_loss(loss, input, target, dtype, backward, device): + output_xpu = loss(input, target) + if backward: + output_xpu.backward(torch.ones_like(output_xpu, dtype=dtype, device=device)) + + +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + L1_loss(loss, input, target, dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + L1_loss(loss, input, target, dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for reduce in ["none", "mean"]: + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + B = shape[0] + S = shape[1] + input = torch.randn((B, S), requires_grad=True).to( + dtype=dtype, device=args.device ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + target = torch.randn((B, S)).to(dtype=dtype, device=args.device) + loss = torch.nn.L1Loss(reduction=reduce) + cache_r = torch.randn((1024 * 
1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + L1_loss(loss, input, target, dtype, backward, args.device) + + # go + print( + "shape:", + (B, S), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.mse_loss.py b/test/microbench/loss.mse_loss.py index ae4a5394f2..f5d0f2b915 100644 --- a/test/microbench/loss.mse_loss.py +++ b/test/microbench/loss.mse_loss.py @@ -1,68 +1,119 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - +backward = True -def _do_test(loss, input, target, dtype, device): - input = input.to(dtype=dtype, device=device) - target = target.to(dtype=dtype, device=device) +def Mse_loss(loss, input, target, dtype, device): output = loss(input, target) grad_output = torch.ones_like(output, dtype=dtype, device=device) - grad_inputs = torch.autograd.grad(output, input, grad_output) - - # warm up - output = loss(input, target) output.backward(grad_output) - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) + +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(grad_output) - print(prof.key_averages().table(sort_by="xpu_time_total")) + Mse_loss(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + - # E2E time - torch.xpu.synchronize() +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(grad_output) - torch.xpu.synchronize() + Mse_loss(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == 
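# The loss scripts allocate two tensors of 1024**3 float32 elements (4 GiB
# each) and run `cache_r = cache_w * i` between timed calls; streaming that
# much data through memory is presumably meant to evict the benchmarked
# tensors from device caches so every iteration sees a cold cache. The shape
# of the trick:
import torch

def make_flush_tensors(device):
    # 2**30 float32 elements = 4 GiB per tensor, far larger than on-chip caches.
    cache_r = torch.randn(1024 * 1024 * 1024, device=device)
    cache_w = torch.randn(1024 * 1024 * 1024, device=device)
    return cache_r, cache_w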
"xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - input = torch.randn(shape, requires_grad=True) - target = torch.randn(shape) - loss = nn.MSELoss(reduction=reduce) - _do_test(loss, input, target, dtype, device) +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + input = torch.randn(shape, requires_grad=True) + target = torch.randn(shape) + input = input.to(dtype=dtype, device=args.device) + target = target.to(dtype=dtype, device=args.device) + loss = nn.MSELoss(reduction=reduce) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Mse_loss(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.multilabel_margin_loss.py b/test/microbench/loss.multilabel_margin_loss.py index 85609d6785..9da9b5ce3f 100644 --- a/test/microbench/loss.multilabel_margin_loss.py +++ b/test/microbench/loss.multilabel_margin_loss.py @@ -1,104 +1,119 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 +backward = True -cache_r = torch.randn((1024 * 1024 * 1024), device="xpu") -cache_w = torch.randn((1024 * 1024 * 1024), device="xpu") +def Margin_loss(loss, input, target, reduce, dtype, device): + output = loss(input, target) + if reduce == "none": + output.backward(torch.ones_like(output, dtype=dtype).to(device)) + else: + output.backward(torch.tensor((1.0), dtype=dtype).to(device)) -def _test_dpcpp(input, target, reduce, dtype): - loss = nn.MultiLabelMarginLoss(reduction=reduce) - input.requires_grad = True - if reduce == "none": - # warm up - output = loss(input, target) - output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - 
output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +def run_profile(loss, input, target, reduce, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Margin_loss(loss, input, target, reduce, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - else: - # warm up - output = loss(input, target) - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") - - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - input = torch.randn(shape, dtype=dtype) - target = torch.randn(shape, dtype=dtype).long() - input_dpcpp = input.to("xpu") - target_dpcpp = target.to("xpu") - _test_dpcpp(input_dpcpp, target_dpcpp, reduce, dtype) + +def run_e2e(loss, input, target, reduce, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + Margin_loss(loss, input, target, reduce, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + input = torch.randn(shape, dtype=dtype, device=args.device) + target = torch.randn(shape, dtype=dtype, device=args.device).long() + input.requires_grad = True + loss = nn.MultiLabelMarginLoss(reduction=reduce) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Margin_loss(loss, input, target, reduce, dtype, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + reduce, + dtype, + cache_r, + cache_w, + args.device, + 
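# Margin_loss branches on the reduction mode because the backward seed must
# match the output shape: reduction="none" keeps one loss per sample, while
# "mean"/"sum" collapse to a 0-d tensor that takes a scalar seed. Shape check:
import torch
import torch.nn as nn

x = torch.randn(4, 8, requires_grad=True)
y = torch.randint(0, 8, (4, 8))  # valid class indices in [0, C-1]
print(nn.MultiLabelMarginLoss(reduction="none")(x, y).shape)  # torch.Size([4])
print(nn.MultiLabelMarginLoss(reduction="mean")(x, y).shape)  # torch.Size([])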
args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + reduce, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.nll_loss.py b/test/microbench/loss.nll_loss.py index 6ee7ed9a9f..8484c15c76 100644 --- a/test/microbench/loss.nll_loss.py +++ b/test/microbench/loss.nll_loss.py @@ -1,51 +1,110 @@ +import argparse import time import torch import torch.nn.functional as F from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape).to(device).to(dtype) - target = torch.empty(shape[0], dtype=torch.long).to(device) - for i in range(8192): - target[i] = i - x = torch.tensor(0.5).to(device).to(dtype) - input.requires_grad = True - - # warm up - output = F.nll_loss(input, target) - output.backward(x) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = F.nll_loss(input, target) - cache_r = cache_w - output.backward(x) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +backward = True + + +def Nll_loss(loss, input, x, target, dtype, device): + output = loss(input, target) + output.backward(x) + + +def run_profile(loss, input, x, target, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w - output = F.nll_loss(input, target) - cache_r = cache_w - output.backward(x) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Nll_loss(loss, input, x, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, x, target, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + Nll_loss(loss, input, x, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + 
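# Nll_loss seeds backward with x = torch.tensor(0.5): F.nll_loss defaults to
# reduction="mean", so the output is 0-d and any 0-d seed works, scaling every
# input gradient by that factor. Minimal check:
import torch
import torch.nn.functional as F

inp = torch.log_softmax(torch.randn(4, 10), dim=1).requires_grad_(True)
tgt = torch.arange(4)
out = F.nll_loss(inp, tgt)        # 0-d under the default reduction="mean"
out.backward(torch.tensor(0.5))   # seed scales d(out)/d(inp) by 0.5
print(inp.grad.shape)             # torch.Size([4, 10])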
input = torch.randn(shape).to(args.device).to(dtype) + target = torch.empty(shape[0], dtype=torch.long).to(args.device) + for i in range(8192): + target[i] = i + input.requires_grad = True + loss = F.nll_loss + x = torch.tensor(0.5).to(args.device).to(dtype) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Nll_loss(loss, input, x, target, dtype, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + loss, + input, + x, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + x, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.smooth_l1_loss.py b/test/microbench/loss.smooth_l1_loss.py index b71739d9e8..17c1357c62 100644 --- a/test/microbench/loss.smooth_l1_loss.py +++ b/test/microbench/loss.smooth_l1_loss.py @@ -1,64 +1,116 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [(8732, 8732), (8192, 8732)] -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - B = shape[0] - S = shape[1] - input = torch.randn((B, S), requires_grad=True).to( - dtype=dtype, device=device - ) - target = torch.randn((B, S)).to(dtype=dtype, device=device) - loss = torch.nn.SmoothL1Loss(reduction=reduce) - - # warm up - output = loss(input, target) - output.backward(torch.ones_like(output, dtype=dtype, device=device)) - - # go - print( - "shape:", - (B, S), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = loss(input, target) - cache_r = cache_w * i - output.backward( - torch.ones_like(output, dtype=torch.float, device=device) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - output = loss(input, target) - cache_r = cache_w * i - output.backward( - torch.ones_like(output, dtype=torch.float, device=device) +backward = True + + +def Smooth_l1_loss(loss, input, target, dtype, device): + output = loss(input, target) + output.backward(torch.ones_like(output, dtype=dtype, device=device)) + + +def run_profile(loss, input, target, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + 
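# One behavioral change worth noting in loss.smooth_l1_loss.py: the old loop
# seeded backward with torch.ones_like(output, dtype=torch.float, device=device)
# even in the bfloat16/float16 sweeps, so half-precision runs used a float32
# gradient seed; the refactored Smooth_l1_loss builds the seed with the
# benchmarked dtype, keeping the whole backward pass in the measured precision.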
ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Smooth_l1_loss(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, target, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Smooth_l1_loss(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + B = shape[0] + S = shape[1] + input = torch.randn((B, S), requires_grad=True).to( + dtype=dtype, device=args.device ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + target = torch.randn((B, S)).to(dtype=dtype, device=args.device) + loss = torch.nn.SmoothL1Loss(reduction=reduce) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Smooth_l1_loss(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (B, S), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/matmul.py b/test/microbench/matmul.py index f79606b02b..0b23362634 100644 --- a/test/microbench/matmul.py +++ b/test/microbench/matmul.py @@ -1,10 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 shape_list = [ (4, 4096, 50400), (4, 2048, 32000), @@ -17,9 +16,10 @@ (4, 2048, 50272), (4, 1792, 250880), ] +backward = True -def matmul(m, n, k, dtype, backward): +def matmul(m, n, k, dtype, backward, device): m1 = torch.rand(2, m, k).type(dtype).to(device) m2 = torch.rand(k, n).type(dtype).to(device) if backward: @@ -32,29 +32,64 @@ def matmul(m, n, k, dtype, backward): output.backward(gy) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) 
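# matmul() builds m1 with shape (2, m, k) and m2 with (k, n); torch.matmul
# broadcasts the 2-D operand over the leading batch dim, giving (2, m, n).
# The call sites pass matmul(shape[0], shape[2], shape[1], ...), so a tuple
# like (4, 4096, 50400) means m=4, k=4096, n=50400. Scaled-down shape check:
import torch

m1 = torch.rand(2, 4, 6)   # (batch, m, k)
m2 = torch.rand(6, 5)      # (k, n), broadcast across the batch
print(torch.matmul(m1, m2).shape)  # torch.Size([2, 4, 5])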
as prof: + for i in range(num_iter): + matmul(shape[0], shape[2], shape[1], dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + matmul(shape[0], shape[2], shape[1], dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: # warm up - matmul(shape[0], shape[2], shape[1], dtype, backward) + matmul(shape[0], shape[2], shape[1], dtype, backward, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - matmul(shape[0], shape[2], shape[1], dtype, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - matmul(shape[0], shape[2], shape[1], dtype, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(shape, dtype, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(shape, dtype, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pad_sequence.py b/test/microbench/pad_sequence.py index a19d265a33..ab5142fdb3 100644 --- a/test/microbench/pad_sequence.py +++ b/test/microbench/pad_sequence.py @@ -1,70 +1,129 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 shape_list = [((25, 300), (22, 300), (15, 300)), ((2, 1000), (100, 1000), (8192, 1000))] +backward = False + + +def Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device): + output = torch.nn.utils.rnn.pad_sequence(([a, b, c]), batch_first, padding_value) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(a, b, c, batch_first, padding_value, dtype, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, b, c, batch_first, padding_value, dtype, backward, device, num_iter): + if 
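# Pad_sequence pads the three variable-length inputs up to the longest and
# stacks them on a new batch dim: batch_first=False yields (T_max, B, *),
# batch_first=True yields (B, T_max, *). For ((25, 300), (22, 300), (15, 300))
# that is (25, 3, 300) or (3, 25, 300), filled with padding_value. Small check:
import torch
from torch.nn.utils.rnn import pad_sequence

a, b, c = torch.randn(5, 4), torch.randn(3, 4), torch.randn(2, 4)
print(pad_sequence([a, b, c]).shape)                    # torch.Size([5, 3, 4])
print(pad_sequence([a, b, c], batch_first=True).shape)  # torch.Size([3, 5, 4])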
device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for batch_first in [False, True]: + for padding_value in [0.0, 1.0, 2.0]: + a = torch.randn(shape[0], device=args.device, dtype=dtype) + b = torch.randn(shape[1], device=args.device, dtype=dtype) + c = torch.randn(shape[2], device=args.device, dtype=dtype) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for batch_first in [False, True]: - for padding_value in [0.0, 1.0, 2.0]: - a = torch.randn(shape[0], device=device, dtype=dtype) - b = torch.randn(shape[1], device=device, dtype=dtype) - c = torch.randn(shape[2], device=device, dtype=dtype) - - if backward: - a.requires_grad_(True) - b.requires_grad_(True) - c.requires_grad_(True) - - # warm up - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; batch_first:", - batch_first, - "; padding_value:", - padding_value, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + a.requires_grad_(True) + b.requires_grad_(True) + c.requires_grad_(True) + + # warm up + Pad_sequence( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + ) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; batch_first:", + batch_first, + "; padding_value:", + padding_value, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == 
"__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.adaptive_max_pool2d.py b/test/microbench/pooling.adaptive_max_pool2d.py index 5c774d76ac..35ce3610d8 100644 --- a/test/microbench/pooling.adaptive_max_pool2d.py +++ b/test/microbench/pooling.adaptive_max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,25 +9,26 @@ (8, 512, 32, 32, (7, 7)), (8, 256, 56, 56, (14, 14)), ] +backward = True -def adaptive_mp2d(shape, dtype, channels_last, backward): +def adaptive_mp2d(shape, dtype, channels_last, backward, device): N, C, H, W, output_size = shape[0], shape[1], shape[2], shape[3], shape[4] if channels_last: input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) Wout = output_size[0] Hout = output_size[1] - grad = torch.randn([N, C, Hout, Wout]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, Hout, Wout]).to(device=device, dtype=dtype) adapt_mp2d = torch.nn.AdaptiveMaxPool2d( output_size=(Hout, Wout), return_indices=True @@ -38,14 +40,38 @@ def adaptive_mp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + adaptive_mp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + adaptive_mp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - adaptive_mp2d(shape, dtype, channels_last, backward) + adaptive_mp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -60,20 +86,44 @@ def adaptive_mp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - adaptive_mp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - adaptive_mp2d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = 
argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.fractional_max_pool2d.py b/test/microbench/pooling.fractional_max_pool2d.py index b35fc0571b..d49abd640d 100644 --- a/test/microbench/pooling.fractional_max_pool2d.py +++ b/test/microbench/pooling.fractional_max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -9,23 +10,24 @@ (1, 3, 1200, 1200, 600, 600), (512, 512, 28, 28, 14, 14), ] +backward = True -def fmp2d(shape, dtype, channels_last, backward): +def fmp2d(shape, dtype, channels_last, backward, device): N, C, H, W, oH, oW = shape[0], shape[1], shape[2], shape[3], shape[4], shape[5] if channels_last: input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW]).to(device=device, dtype=dtype) fmp = torch.nn.FractionalMaxPool2d(2, output_size=(oH, oW), return_indices=True) @@ -35,14 +37,38 @@ def fmp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + fmp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + fmp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - fmp2d(shape, dtype, channels_last, backward) + fmp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -55,20 +81,44 @@ def fmp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - fmp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - fmp2d(shape, dtype, channels_last, backward) - 
torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.fractional_max_pool3d.py b/test/microbench/pooling.fractional_max_pool3d.py index 26d8921044..1721f80e9a 100644 --- a/test/microbench/pooling.fractional_max_pool3d.py +++ b/test/microbench/pooling.fractional_max_pool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (1, 3, 144, 144, 144, 72, 72, 72), (512, 512, 12, 12, 12, 6, 6, 6), ] +backward = True -def fmp3d(shape, dtype, channels_last, backward): +def fmp3d(shape, dtype, channels_last, backward, device): N, C, H, W, D, oH, oW, oD = ( shape[0], shape[1], @@ -26,14 +28,14 @@ def fmp3d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W, D) .to(memory_format=torch.channels_last_3d) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W, D).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W, D).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW, oD]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW, oD]).to(device=device, dtype=dtype) fmp = torch.nn.FractionalMaxPool3d(2, output_size=(oH, oW, oD), return_indices=True) @@ -43,14 +45,38 @@ def fmp3d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + fmp3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + fmp3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - fmp3d(shape, dtype, channels_last, backward) + fmp3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -63,20 +89,44 
@@ def fmp3d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - fmp3d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - fmp3d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_pool2d.py b/test/microbench/pooling.max_pool2d.py index 37e88646ab..540bb64168 100644 --- a/test/microbench/pooling.max_pool2d.py +++ b/test/microbench/pooling.max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -9,9 +10,10 @@ (64, 1024, 112, 112, (6), (4)), (16, 2048, 224, 224, (3), (2)), ] +backward = True -def mp2d(shape, dtype, channels_last, backward): +def mp2d(shape, dtype, channels_last, backward, device): N, C, H, W, kernel_size, stride = ( shape[0], shape[1], @@ -25,10 +27,10 @@ def mp2d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) @@ -38,7 +40,7 @@ def mp2d(shape, dtype, channels_last, backward): else: Wout = (W - kernel_size[1]) / stride[1] + 1 Hout = (H - kernel_size[0]) / stride[0] + 1 - grad = torch.randn([N, C, int(Hout), int(Wout)]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, int(Hout), int(Wout)]).to(device=device, dtype=dtype) mp2d = torch.nn.MaxPool2d(shape[4], stride=shape[5], return_indices=True) @@ -48,14 +50,38 @@ def mp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + mp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in 
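# mp2d sizes the gradient with the no-padding pooling arithmetic
# Hout = (H - kernel) / stride + 1 (truncated to int), matching MaxPool2d's
# floor behavior. For the (16, 2048, 224, 224, (3), (2)) entry:
# (224 - 3) / 2 + 1 = 111.5 -> 111. Check:
import torch

out, idx = torch.nn.MaxPool2d(3, stride=2, return_indices=True)(
    torch.randn(1, 1, 224, 224)
)
print(out.shape)  # torch.Size([1, 1, 111, 111])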
range(num_iter): + mp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - mp2d(shape, dtype, channels_last, backward) + mp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -72,20 +98,44 @@ def mp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - mp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - mp2d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_pool3d.py b/test/microbench/pooling.max_pool3d.py index 0563287a7d..2b488d391a 100644 --- a/test/microbench/pooling.max_pool3d.py +++ b/test/microbench/pooling.max_pool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,10 +9,10 @@ (1, 4, 144, 144, 144, 72, 72, 72), (512, 512, 12, 12, 12, 6, 6, 6), ] -num_iter = 20 +backward = True -def fmp3d(shape, dtype, channels_last, backward): +def mp3d(shape, dtype, channels_last, backward, device): torch.manual_seed(20) N, C, H, W, D, oH, oW, oD = ( shape[0], @@ -28,59 +29,101 @@ def fmp3d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W, D) .to(memory_format=torch.channels_last_3d) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W, D).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W, D).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW, oD]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW, oD]).to(device=device, dtype=dtype) fmp = torch.nn.MaxPool3d(2, return_indices=True) output = fmp(input) - # warm up - output = fmp(input) - if backward: - output[0].backward(grad) - - # go - print( - "shape:", - (shape[0], shape[1], shape[2], shape[3], shape[4]), - "; datatype:", - dtype, - "; channels_last:", - channels_last, - "; backward:", - backward, - ) + +def run_profile(shape, dtype, channels_last, backward, device, num_iter): 
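# Note: unlike mp2d above, the refactored mp3d ends at `output = fmp(input)`;
# it still allocates grad when backward=True but never seeds a backward pass,
# so both timing modes for max_pool3d appear to measure only the forward op
# even though the printed header reports backward: True.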
with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): - output = fmp(input) - if backward: - output[0].backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total")) + mp3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # E2E time - torch.xpu.synchronize() + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): - output = fmp(input) - if backward: - output[0].backward(grad) - torch.xpu.synchronize() + mp3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -if __name__ == "__main__": - backward = True +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: - fmp3d(shape, dtype, channels_last, backward=True) + # warm up + mp3d(shape, dtype, channels_last, backward, args.device) + + # go + print( + "shape:", + (shape[0], shape[1], shape[2], shape[3], shape[4]), + "; datatype:", + dtype, + "; channels_last:", + channels_last, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_unpool2d.py b/test/microbench/pooling.max_unpool2d.py index 9d0c3388e5..9c48a8c8f7 100644 --- a/test/microbench/pooling.max_unpool2d.py +++ b/test/microbench/pooling.max_unpool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (4, 65, 128, 128), (8, 128, 128, 128), ] +backward = True -def maxUnpool2d(shape, dtype, device, channels_last, backward): +def maxUnpool2d(shape, dtype, channels_last, backward, device): N, C, H, W = int(shape[0]), int(shape[1]), int(shape[2]), int(shape[3]) kernel_size = 2 @@ -58,15 +60,38 @@ def maxUnpool2d(shape, dtype, device, channels_last, backward): y_dpcpp.backward(grad_dpcpp) -if __name__ == "__main__": - backward = True - device = "xpu" - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + maxUnpool2d(shape, 
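# MaxUnpool2d is the inverse of MaxPool2d: it takes the pooled values plus the
# indices that return_indices=True recorded, scatters each value back to its
# argmax position, and zero-fills everything else. Minimal round trip:
import torch

pool = torch.nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = torch.nn.MaxUnpool2d(2, stride=2)
x = torch.randn(1, 1, 4, 4)
y, idx = pool(x)
print(unpool(y, idx).shape)  # torch.Size([1, 1, 4, 4])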
dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + maxUnpool2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - maxUnpool2d(shape, dtype, device, channels_last, backward=backward) + maxUnpool2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -81,22 +106,44 @@ def maxUnpool2d(shape, dtype, device, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - maxUnpool2d( - shape, dtype, device, channels_last, backward=backward - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - maxUnpool2d(shape, dtype, device, channels_last, backward=backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_unpool3d.py b/test/microbench/pooling.max_unpool3d.py index 66610eaaf7..91d87d15c1 100644 --- a/test/microbench/pooling.max_unpool3d.py +++ b/test/microbench/pooling.max_unpool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (4, 33, 64, 64, 64), (16, 32, 32, 32, 32), ] +backward = True -def maxUnpool3d(shape, dtype, device, channels_last, backward): +def maxUnpool3d(shape, dtype, channels_last, backward, device): N, C, D, H, W = ( int(shape[0]), int(shape[1]), @@ -64,15 +66,38 @@ def maxUnpool3d(shape, dtype, device, channels_last, backward): y_dpcpp.backward(grad_dpcpp) -if __name__ == "__main__": - backward = True - device = "xpu" - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + maxUnpool3d(shape, dtype, channels_last, backward, device) + 
print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + maxUnpool3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - maxUnpool3d(shape, dtype, device, channels_last, backward=backward) + maxUnpool3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -87,22 +112,44 @@ def maxUnpool3d(shape, dtype, device, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - maxUnpool3d( - shape, dtype, device, channels_last, backward=backward - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - maxUnpool3d(shape, dtype, device, channels_last, backward=backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/reduce.max.py b/test/microbench/reduce.max.py index cf03fe413a..d20b4d3c09 100644 --- a/test/microbench/reduce.max.py +++ b/test/microbench/reduce.max.py @@ -1,49 +1,89 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 + + +def Max(input, dim, backward, device): + output = torch.max(input, dim) + + +def run_profile(input, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Max(input, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Max(input, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == 
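# torch.max(input, dim) returns a (values, indices) namedtuple. The comment in
# reduce.max.py distinguishes dim=1 (reduce along the contiguous last dim of a
# row-major (8192, 8192) tensor, coalesced reads) from dim=0 (reduce along the
# strided dim), which exercises a different memory-access pattern. Shape check:
import torch

x = torch.randn(4, 6)
vals, idx = torch.max(x, 1)   # one (value, index) per row
print(vals.shape, idx.shape)  # torch.Size([4]) torch.Size([4])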
"xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + # dim = 1: reduce along contiguous dim # dim = 0: reduce along strided dim -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [1, 0]: - input = torch.randn(8192, 8192, dtype=dtype, device=device) - - # warm up - output = torch.max(input, 1) - output = torch.max(input, 0) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.max(input, dim) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.max(input, dim) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [1, 0]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Max(input, dim, backward, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/reduce.sum.py b/test/microbench/reduce.sum.py index de092c4ee8..32b8d3eda2 100644 --- a/test/microbench/reduce.sum.py +++ b/test/microbench/reduce.sum.py @@ -1,57 +1,93 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 + + +def Sum(input, dim, backward, device): + if dim is None: + output = torch.sum(input) + else: + output = torch.sum(input, dim) + + +def run_profile(input, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Sum(input, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Sum(input, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", 
f"{float(e2e_time):.20f}") + # dim = None: reduce all # dim = 0: reduce along strided dim # dim = 1: reduce along contiguous dim -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [None, 0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - output = torch.sum(input) - output = torch.sum(input, 0) - output = torch.sum(input, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - if dim is None: - output = torch.sum(input) - else: - output = torch.sum(input, dim) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - if dim is None: - output = torch.sum(input) - else: - output = torch.sum(input, dim) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [None, 0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Sum(input, dim, backward, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/remainder.py b/test/microbench/remainder.py index 3a0c79ce32..09b4e9026a 100644 --- a/test/microbench/remainder.py +++ b/test/microbench/remainder.py @@ -1,58 +1,92 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(1024, 1024, 1024), (6, 7, 3, 2), (8193, 8193, 4, 4)] backward = True -num_iter = 20 -shape_list = [(1024, 1024, 1024), (6, 7, 3, 2), (8193, 8193, 4, 4)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for divisor in [2, -1.5, 3]: - input = torch.randn(shape, device=device, dtype=dtype) - if backward: - input.requires_grad_(True) - - # warm - output = torch.remainder(input, divisor) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; divisor:", - divisor, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.remainder(input, divisor) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - 
print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.remainder(input, divisor) +def Remainder(input, divisor, backward, device): + output = torch.remainder(input, divisor) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(input, divisor, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Remainder(input, divisor, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, divisor, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Remainder(input, divisor, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for divisor in [2, -1.5, 3]: + input = torch.randn(shape, device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm + Remainder(input, divisor, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; divisor:", + divisor, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, divisor, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, divisor, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/repeat_interleave.py b/test/microbench/repeat_interleave.py index b0a31207ef..dc228b93c6 100644 --- a/test/microbench/repeat_interleave.py +++ b/test/microbench/repeat_interleave.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -7,58 +8,94 @@ (16, 8, 23), (4, 2048, 2048), ] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for repeats in [8]: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 2]: - input = torch.randn(shape, device=device, dtype=dtype) - if backward: - input.requires_grad_(True) +def Repeat_interleave(input, repeats, dim, backward, device): + output = torch.repeat_interleave(input, repeats, dim) + if backward: + gy = torch.empty_like(output) + output.backward(gy) - # warm up - for i in range(5): - output = torch.repeat_interleave(input, repeats, dim) + +def run_profile(input, 
repeats, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Repeat_interleave(input, repeats, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, repeats, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Repeat_interleave(input, repeats, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for repeats in [8]: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 2]: + input = torch.randn(shape, device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - # go - print( - "shape:", - shape, - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.repeat_interleave(input, repeats, dim) - - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.repeat_interleave(input, repeats, dim) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm up + Repeat_interleave(input, repeats, dim, backward, args.device) + + # go + print( + "shape:", + shape, + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, repeats, dim, backward, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, repeats, dim, backward, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/roll.py b/test/microbench/roll.py index 7680d7e784..28c3e98a07 100644 --- a/test/microbench/roll.py +++ b/test/microbench/roll.py @@ -1,12 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 - shape_list = [ ((1024, 1024, 1024), (-1), (0)), ((1024, 1024, 1024), (128, 128), (-1, 0)), @@ -15,51 +12,89 @@ ((16, 3, 512, 512), (127), (0)), ((16, 3, 512, 512), (127, 127), (0, -1)), ] +backward = True + + +def Roll(input, shape, 
backward, device): + output = torch.roll(input, shifts=shape[1], dims=shape[2]) + if backward: + gy = torch.empty_like(output) + output.backward(gy) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - if backward: - input.requires_grad_(True) - - # warm - output = torch.roll(input, shifts=shape[1], dims=shape[2]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[2], - "; shifts:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.roll(input, shifts=shape[1], dims=shape[2]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def run_profile(input, shape, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.roll(input, shifts=shape[1], dims=shape[2]) + Roll(input, shape, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, shape, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Roll(input, shape, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm + Roll(input, shape, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[2], + "; shifts:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, shape, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, shape, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.cumsum.py b/test/microbench/scan.cumsum.py index 944a402a64..7e1699b813 100644 --- a/test/microbench/scan.cumsum.py +++ b/test/microbench/scan.cumsum.py @@ -1,48 +1,87 @@ +import 
argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(8193, 8193), (1234, 8193), (8192, 1234), (1, 4 * 15000)] backward = False -num_iter = 20 -shape_list = [(8193, 8193), (1234, 8193), (8192, 1234), (1, 4 * 15000)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - torch.cumsum(input, 0) - torch.cumsum(input, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.cumsum(input, 0) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - torch.cumsum(input, 0) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def Cumsum(input, dim, device): + torch.cumsum(input, dim) + + +def run_profile(input, dim, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Cumsum(input, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Cumsum(input, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Cumsum(input, dim, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.masked_select.py b/test/microbench/scan.masked_select.py index 6ed1b9cf63..fe46879679 100644 --- a/test/microbench/scan.masked_select.py +++ b/test/microbench/scan.masked_select.py @@ -1,36 +1,77 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8193, 8193)] -device = "xpu" backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in 
[torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - mask = input.ge(0.5) - # warm up - torch.masked_select(input, mask) - - # go - print("shape:", shape, "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.masked_select(input, mask) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Masked_select(input, mask, device): + torch.masked_select(input, mask) + + +def run_profile(input, mask, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.masked_select(input, mask) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Masked_select(input, mask, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, mask, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Masked_select(input, mask, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + mask = input.ge(0.5) + # warm up + Masked_select(input, mask, args.device) + + # go + print("shape:", shape, "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, mask, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, mask, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.nonzero.py b/test/microbench/scan.nonzero.py index e2f6afbdad..a98d4b3171 100644 --- a/test/microbench/scan.nonzero.py +++ b/test/microbench/scan.nonzero.py @@ -1,39 +1,80 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(2047, 2047, 10), (1, 4 * 15000)] -device = "xpu" backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - if shape == (2047, 2047, 10): - input = torch.randint(-2, 3, shape, dtype=dtype, device=device) - else: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - torch.nonzero(input) - - # go - print("shape:", shape, "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, 
ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.nonzero(input) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Nonzero(input, device): + torch.nonzero(input) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.nonzero(input) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Nonzero(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Nonzero(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + if shape == (2047, 2047, 10): + input = torch.randint(-2, 3, shape, dtype=dtype, device=args.device) + else: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Nonzero(input, args.device) + + # go + print("shape:", shape, "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.topk.py b/test/microbench/scan.topk.py index 32e0ee6186..a25270d43d 100644 --- a/test/microbench/scan.topk.py +++ b/test/microbench/scan.topk.py @@ -1,57 +1,94 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 - shape_list = [(8193, 8193)] +backward = False k = 4096 largest = True sorted = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [None, 0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - # warm up - torch.topk(input, k) - torch.topk(input, k, 0, largest, sorted) - torch.topk(input, k, 1, largest, sorted) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - if dim is None: - torch.topk(input, k) - else: - torch.topk(input, k, dim, largest, sorted) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - 
torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - if dim is None: - torch.topk(input, k) - else: - torch.topk(input, k, dim, largest, sorted) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + +def Topk(input, dim, k, largest, sorted, device): + if dim is None: + torch.topk(input, k) + else: + torch.topk(input, k, dim, largest, sorted) + + +def run_profile(input, dim, k, largest, sorted, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Topk(input, dim, k, largest, sorted, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, k, largest, sorted, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Topk(input, dim, k, largest, sorted, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [None, 0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Topk(input, dim, k, largest, sorted, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, dim, k, largest, sorted, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, dim, k, largest, sorted, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.unique.py b/test/microbench/scan.unique.py index 5b17d7b16e..1a44e36e16 100644 --- a/test/microbench/scan.unique.py +++ b/test/microbench/scan.unique.py @@ -1,40 +1,77 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(2049, 2049)] backward = False -num_iter = 20 -shape_list = [(2049, 2049)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randint(100, shape, dtype=dtype, device=device) - - # warm up - torch.unique(input, sorted=True, return_inverse=True, return_counts=True) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.unique( - input, sorted=True, return_inverse=True, return_counts=True - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # 
E2E time - torch.xpu.synchronize() - t1 = time.time() +def Unique(input, device): + torch.unique(input, sorted=True, return_inverse=True, return_counts=True) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.unique( - input, sorted=True, return_inverse=True, return_counts=True - ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Unique(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Unique(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randint(100, shape, dtype=dtype, device=args.device) + + # warm up + Unique(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scatter_gather.gather.py b/test/microbench/scatter_gather.gather.py index aa2279a800..a1bfc690f3 100644 --- a/test/microbench/scatter_gather.gather.py +++ b/test/microbench/scatter_gather.gather.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -15,47 +16,93 @@ ((10240, 8192), (2560, 8192), 0), ((8192, 10240), (2048, 10240), 0), ] - -device = "xpu" backward = False -num_iter = 20 - -g_xpu = torch.Generator(device=device) -g_xpu.manual_seed(25) -torch.manual_seed(25) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - shapes = shape[0] - ishapes = shape[1] - dim = shape[2] - a = torch.randn(shapes, dtype=dtype, device=device) - index = torch.randint(1, shapes[dim], ishapes, device=device, generator=g_xpu) - print( - "shape:", - shapes, - "; kernel_size:", - ishapes, - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.gather(a, dim, index) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Gather(a, dim, index, device): + torch.gather(a, dim, index) + + +def run_profile(a, 
dim, index, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.gather(a, dim, index) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Gather(a, dim, index, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, dim, index, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Gather(a, dim, index, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + shapes = shape[0] + ishapes = shape[1] + dim = shape[2] + g_xpu = torch.Generator(device=args.device) + g_xpu.manual_seed(25) + torch.manual_seed(25) + a = torch.randn(shapes, dtype=dtype, device=args.device) + index = torch.randint( + 1, shapes[dim], ishapes, device=args.device, generator=g_xpu + ) + + # warm up + Gather(a, dim, index, args.device) + + # go + print( + "shape:", + shapes, + "; kernel_size:", + ishapes, + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(a, dim, index, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(a, dim, index, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scatter_gather.scatter.py b/test/microbench/scatter_gather.scatter.py index 7c5f105d7a..707a390a3a 100644 --- a/test/microbench/scatter_gather.scatter.py +++ b/test/microbench/scatter_gather.scatter.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -19,17 +20,10 @@ ((4096, 8192, 8192), 1), ((4097, 8193, 8193), 1), ] - -device = "xpu" backward = False -num_iter = 20 - -g_xpu = torch.Generator(device=device) -g_xpu.manual_seed(25) -torch.manual_seed(25) -def Scatter(shape, dtype, dim, device): +def Scatter(shape, dtype, dim, g_xpu, device): if dim == 2: m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3] src = torch.ones((m, n, k1), dtype=dtype, device=device) @@ -50,12 +44,41 @@ def Scatter(shape, dtype, dim, device): dst = zeros.scatter_(dim, index, src) -if __name__ == "__main__": +def run_profile(shape, dtype, dim, g_xpu, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Scatter(shape, dtype, dim, g_xpu, device) + 
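The gather benchmark above (and the scatter variants below) seeds a per-device torch.Generator so the random index tensors are identical on every run. A minimal sketch of that pattern (illustrative; make_index is not a helper in this patch):

    import torch

    def make_index(src_shape, index_shape, dim, device, seed=25):
        # A fixed seed keeps the index tensor reproducible across runs,
        # so timing differences cannot come from different access patterns.
        g = torch.Generator(device=device)
        g.manual_seed(seed)
        return torch.randint(
            1, src_shape[dim], index_shape, device=device, generator=g
        )
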
diff --git a/test/microbench/scatter_gather.scatter.py b/test/microbench/scatter_gather.scatter.py
index 7c5f105d7a..707a390a3a 100644
--- a/test/microbench/scatter_gather.scatter.py
+++ b/test/microbench/scatter_gather.scatter.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -19,17 +20,10 @@
     ((4096, 8192, 8192), 1),
     ((4097, 8193, 8193), 1),
 ]
-
-device = "xpu"
 backward = False
-num_iter = 20
-
-g_xpu = torch.Generator(device=device)
-g_xpu.manual_seed(25)
-torch.manual_seed(25)
 
 
-def Scatter(shape, dtype, dim, device):
+def Scatter(shape, dtype, dim, g_xpu, device):
     if dim == 2:
         m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3]
         src = torch.ones((m, n, k1), dtype=dtype, device=device)
@@ -50,12 +44,41 @@ def Scatter(shape, dtype, dim, device):
         dst = zeros.scatter_(dim, index, src)
 
 
-if __name__ == "__main__":
+def run_profile(shape, dtype, dim, g_xpu, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Scatter(shape, dtype, dim, g_xpu, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, dim, g_xpu, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Scatter(shape, dtype, dim, g_xpu, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             dim = shape[1]
+            g_xpu = torch.Generator(device=args.device)
+            g_xpu.manual_seed(25)
+            torch.manual_seed(25)
 
             # warm up
-            Scatter(shape, dtype, dim, device)
+            Scatter(shape, dtype, dim, g_xpu, args.device)
 
             # go
             print(
@@ -68,20 +91,30 @@ def Scatter(shape, dtype, dim, device):
                 "; backward:",
                 backward,
             )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    Scatter(shape, dtype, dim, device)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                Scatter(shape, dtype, dim, device)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+            if not args.e2e_only:
+                run_profile(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/scatter_gather.scatter_add.py b/test/microbench/scatter_gather.scatter_add.py
index 97835bc093..0e73f86594 100644
--- a/test/microbench/scatter_gather.scatter_add.py
+++ b/test/microbench/scatter_gather.scatter_add.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -20,17 +21,10 @@
     ((4096, 8192, 8192), 1),
     ((4097, 8193, 8193), 1),
 ]
-
-device = "xpu"
 backward = False
-num_iter = 20
-
-g_xpu = torch.Generator(device=device)
-g_xpu.manual_seed(25)
-torch.manual_seed(25)
 
 
-def Scatter_add(shape, dtype, dim, device):
+def Scatter_add(shape, dtype, dim, g_xpu, device):
     if dim == 2:
         m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3]
         src = torch.ones((m, n, k1), dtype=dtype, device=device)
@@ -51,12 +45,41 @@ def Scatter_add(shape, dtype, dim, device):
         dst = zeros.scatter_add_(dim, index, src)
 
 
-if __name__ == "__main__":
+def run_profile(shape, dtype, dim, g_xpu, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Scatter_add(shape, dtype, dim, g_xpu, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, dim, g_xpu, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Scatter_add(shape, dtype, dim, g_xpu, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             dim = shape[1]
+            g_xpu = torch.Generator(device=args.device)
+            g_xpu.manual_seed(25)
+            torch.manual_seed(25)
 
             # warm up
-            Scatter_add(shape, dtype, dim, device)
+            Scatter_add(shape, dtype, dim, g_xpu, args.device)
 
             # go
             print(
@@ -69,20 +92,30 @@ def Scatter_add(shape, dtype, dim, device):
                 "; backward:",
                 backward,
             )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    Scatter_add(shape, dtype, dim, device)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                Scatter_add(shape, dtype, dim, device)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+            if not args.e2e_only:
+                run_profile(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/softmax.py b/test/microbench/softmax.py
index 08d598e839..4927e1ce25 100644
--- a/test/microbench/softmax.py
+++ b/test/microbench/softmax.py
@@ -1,56 +1,94 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8192, 8192), (64, 8192), (8192, 64)]
 backward = True
-num_iter = 20
-shape_list = [(8192, 8192), (64, 8192), (8192, 64)]
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for dim in [0, 1]:
-            H, W = (int(shape[0]), int(shape[1]))
-            input = torch.randn((H, W)).to(dtype=dtype, device=device)
-
-            softmax = torch.nn.Softmax(dim=dim)
-            softmax.to(device=device, dtype=dtype)
-            grad_dpcpp = torch.randn((H, W)).to(device=device, dtype=dtype)
-            input.requires_grad_(True)
-
-            # warm up
-            output = softmax(input)
-            output.backward(grad_dpcpp)
-
-            # go
-            print(
-                "shape:",
-                (shape),
-                "; datatype:",
-                dtype,
-                "; dim:",
-                dim,
-                "; backward:",
-                backward,
-            )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    output = softmax(input)
-                    output.backward(grad_dpcpp)
-            print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                output = softmax(input)
-                output.backward(grad_dpcpp)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+def Softmax(input, softmax, grad_dpcpp, device):
+    output = softmax(input)
+    output.backward(grad_dpcpp)
+
+
+def run_profile(input, softmax, grad_dpcpp, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Softmax(input, softmax, grad_dpcpp, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, softmax, grad_dpcpp, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Softmax(input, softmax, grad_dpcpp, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            for dim in [0, 1]:
+                H, W = (int(shape[0]), int(shape[1]))
+                input = torch.randn((H, W)).to(dtype=dtype, device=args.device)
+
+                softmax = torch.nn.Softmax(dim=dim)
+                softmax.to(device=args.device, dtype=dtype)
+                grad_dpcpp = torch.randn((H, W)).to(device=args.device, dtype=dtype)
+                input.requires_grad_(True)
+
+                # warm up
+                Softmax(input, softmax, grad_dpcpp, args.device)
+
+                # go
+                print(
+                    "shape:",
+                    (shape),
+                    "; datatype:",
+                    dtype,
+                    "; dim:",
+                    dim,
+                    "; backward:",
+                    backward,
+                )
+                if not args.e2e_only:
+                    run_profile(input, softmax, grad_dpcpp, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, softmax, grad_dpcpp, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
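Each run_profile in these scripts picks the profiler activity that matches the target device and sorts the summary table by that device's total time. A standalone sketch of the same dispatch (illustrative; profile_op is not part of this patch, and with --device cpu only the CPU activity would be collected):

    import torch
    from torch.profiler import profile, ProfilerActivity

    def profile_op(fn, device, num_iter=20):
        # Always collect CPU events; add the matching GPU activity if any.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        with profile(activities=activities, record_shapes=True) as prof:
            for _ in range(num_iter):
                fn()
        print(prof.key_averages().table(sort_by=f"{device}_time_total"))
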
diff --git a/test/microbench/sort.py b/test/microbench/sort.py
index 4c09d627bb..f2fba81257 100644
--- a/test/microbench/sort.py
+++ b/test/microbench/sort.py
@@ -1,55 +1,90 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8193, 8193)]
 backward = False
-num_iter = 20
-shape_list = [(8193, 8193)]
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for dim in [None, 0, 1]:
-            input = torch.randn(shape, dtype=dtype, device=device)
-
-            # warm up
-            torch.sort(input)
-            torch.sort(input, 0)
-            torch.sort(input, 1)
-
-            # go
-            print(
-                "shape:",
-                (shape),
-                "; datatype:",
-                dtype,
-                "; dim:",
-                dim,
-                "; backward:",
-                backward,
-            )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    if dim is None:
-                        torch.sort(input)
-                    else:
-                        torch.sort(input, dim)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                if dim is None:
-                    torch.sort(input)
-                else:
-                    torch.sort(input, dim)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+def Sort(input, dim, device):
+    if dim is None:
+        torch.sort(input)
+    else:
+        torch.sort(input, dim)
+
+
+def run_profile(input, dim, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Sort(input, dim, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, dim, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Sort(input, dim, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            for dim in [None, 0, 1]:
+                input = torch.randn(shape, dtype=dtype, device=args.device)
+
+                # warm up
+                Sort(input, dim, args.device)
+
+                # go
+                print(
+                    "shape:",
+                    (shape),
+                    "; datatype:",
+                    dtype,
+                    "; dim:",
+                    dim,
+                    "; backward:",
+                    backward,
+                )
+                if not args.e2e_only:
+                    run_profile(input, dim, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, dim, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/sort.randperm.py b/test/microbench/sort.randperm.py
index 0e8ef1f830..62b986ee79 100644
--- a/test/microbench/sort.randperm.py
+++ b/test/microbench/sort.randperm.py
@@ -1,34 +1,75 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8193)]
 backward = False
-num_iter = 20
-shape_list = [(8193)]
-for shape in shape_list:
-    for dtype in [torch.float32]:
-        # warm up
-        torch.randperm(shape, dtype=dtype, device=device)
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                torch.randperm(shape, dtype=dtype, device=device)
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
+def Randperm(shape, dtype, device):
+    torch.randperm(shape, dtype=dtype, device=device)
+
+
+def run_profile(shape, dtype, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
         for i in range(num_iter):
-            torch.randperm(shape, dtype=dtype, device=device)
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+            Randperm(shape, dtype, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Randperm(shape, dtype, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.float32]:
+            # warm up
+            Randperm(shape, dtype, args.device)
+
+            # go
+            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
+            if not args.e2e_only:
+                run_profile(shape, dtype, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_bicubic2d.py b/test/microbench/upsample_bicubic2d.py
index 4a985e6f05..557684cd03 100644
--- a/test/microbench/upsample_bicubic2d.py
+++ b/test/microbench/upsample_bicubic2d.py
@@ -1,85 +1,123 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [
+    [1, 3, 1200, 1200],
+    [1, 128, 1200, 1200],
+    [1, 3, 1200, 1200],
+    [128, 128, 5, 5],
+]
+scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7]]
 backward = True
-num_iter = 20
 
-cache_r = torch.randn((1024 * 1024 * 1024), device="xpu")
-cache_w = torch.randn((1024 * 1024 * 1024), device="xpu")
 
-def simple_test(in_shape, scale_factor, backward, dtype):
-    in_tensor = torch.randn(
-        in_shape, dtype=dtype, device=device, requires_grad=backward
-    )
+def Bicubic2d(in_tensor, scale, backward, device):
     output = torch.nn.functional.interpolate(
-        in_tensor, mode="bicubic", scale_factor=scale_factor, align_corners=True
+        in_tensor,
+        mode="bicubic",
+        scale_factor=scale,
+        align_corners=True,
     )
-
-    # warm_up
-    for _ in range(10):
-        output = torch.nn.functional.interpolate(
-            in_tensor, mode="bicubic", scale_factor=scale_factor, align_corners=True
+    if backward:
+        output = torch.autograd.grad(
+            output, in_tensor, grad_outputs=torch.ones_like(output)
         )
 
-    # go
-    print(
-        "shape:",
-        (in_shape),
-        "; datatype:",
-        dtype,
-        "; scale_factor:",
-        scale_factor,
-        "; backward:",
-        backward,
-    )
+
+def run_profile(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
     ) as prof:
-        for i in range(num_iter):
+        for _ in range(num_iter):
             cache_r = cache_w + 1
-            output = torch.nn.functional.interpolate(
-                in_tensor,
-                mode="bicubic",
-                scale_factor=scale_factor,
-                align_corners=True,
-            )
-            if backward:
-                output = torch.autograd.grad(
-                    output, in_tensor, grad_outputs=torch.ones_like(output)
-                )
-    print(prof.key_averages().table(sort_by="xpu_time_total"))
+            Bicubic2d(in_tensor, scale, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
 
-    # E2E time
-    torch.xpu.synchronize()
+def run_e2e(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
-    for i in range(num_iter):
+    for _ in range(num_iter):
         cache_r = cache_w + 1
-        output = torch.nn.functional.interpolate(
-            in_tensor,
-            mode="bicubic",
-            scale_factor=scale_factor,
-            align_corners=True,
-        )
-        if backward:
-            output = torch.autograd.grad(
-                output, in_tensor, grad_outputs=torch.ones_like(output)
-            )
-    torch.xpu.synchronize()
+        Bicubic2d(in_tensor, scale, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")
 
 
-shape_list = [
-    [1, 3, 1200, 1200],
-    [1, 128, 1200, 1200],
-    [1, 3, 1200, 1200],
-    [128, 128, 5, 5],
-]
-scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7]]
-for sp, sf in zip(shape_list, scale_factor):
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        simple_test(sp, sf, backward, dtype)
+def benchmark(args):
+    for in_shape, scale in zip(shape_list, scale_factor):
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            in_tensor = torch.randn(
+                in_shape, dtype=dtype, device=args.device, requires_grad=backward
+            )
+            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device)
+            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device)
+
+            # warm_up
+            Bicubic2d(in_tensor, scale, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                (in_shape),
+                "; datatype:",
+                dtype,
+                "; scale_factor:",
+                scale,
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_bilinear2d.py b/test/microbench/upsample_bilinear2d.py
index fc841edd27..48e18e75d0 100644
--- a/test/microbench/upsample_bilinear2d.py
+++ b/test/microbench/upsample_bilinear2d.py
@@ -1,87 +1,124 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [
+    [1, 3, 1200, 1200],
+    [1, 128, 1200, 1200],
+    [1, 3, 1200, 1200],
+    [128, 128, 5, 5],
+    [8, 32, 256, 256],
+]
+scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7], 3]
 backward = True
-num_iter = 20
 
-cache_r = torch.randn((1024 * 1024 * 1024), device="xpu")
-cache_w = torch.randn((1024 * 1024 * 1024), device="xpu")
 
-def simple_test(in_shape, scale_factor, backward, dtype, mode):
-    in_tensor = torch.randn(
-        in_shape, dtype=dtype, device=device, requires_grad=backward
-    )
+def Bilinear2d(in_tensor, scale, backward, device):
     output = torch.nn.functional.interpolate(
-        in_tensor, mode=mode, scale_factor=scale_factor
+        in_tensor,
+        mode="bilinear",
+        scale_factor=scale,
     )
-
-    # warm_up
-    for _ in range(10):
-        output = torch.nn.functional.interpolate(
-            in_tensor, mode=mode, scale_factor=scale_factor
+    if backward:
+        output = torch.autograd.grad(
+            output, in_tensor, grad_outputs=torch.ones_like(output)
         )
 
-    # go
-    print(
-        "shape:",
-        (in_shape),
-        "; datatype:",
-        dtype,
-        "; scale_factor:",
-        scale_factor,
-        "; mode:",
-        mode,
-        "; backward:",
-        backward,
-    )
+
+def run_profile(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
    ) as prof:
-        for i in range(num_iter):
+        for _ in range(num_iter):
             cache_r = cache_w + 1
-            output = torch.nn.functional.interpolate(
-                in_tensor,
-                mode=mode,
-                scale_factor=scale_factor,
-            )
-            if backward:
-                output = torch.autograd.grad(
-                    output, in_tensor, grad_outputs=torch.ones_like(output)
-                )
-    print(prof.key_averages().table(sort_by="xpu_time_total"))
+            Bilinear2d(in_tensor, scale, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
 
-    # E2E time
-    torch.xpu.synchronize()
+def run_e2e(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
-    for i in range(num_iter):
+    for _ in range(num_iter):
         cache_r = cache_w + 1
-        output = torch.nn.functional.interpolate(
-            in_tensor,
-            mode=mode,
-            scale_factor=scale_factor,
-        )
-        if backward:
-            output = torch.autograd.grad(
-                output, in_tensor, grad_outputs=torch.ones_like(output)
-            )
-    torch.xpu.synchronize()
+        Bilinear2d(in_tensor, scale, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")
 
 
-shape_list = [
-    [1, 3, 1200, 1200],
-    [1, 128, 1200, 1200],
-    [1, 3, 1200, 1200],
-    [128, 128, 5, 5],
-    [8, 32, 256, 256],
-]
-scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7], 3]
-for sp, sf in zip(shape_list, scale_factor):
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for mode in ["bilinear"]:
-            simple_test(sp, sf, backward, dtype, mode)
+def benchmark(args):
+    for in_shape, scale in zip(shape_list, scale_factor):
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            in_tensor = torch.randn(
+                in_shape, dtype=dtype, device=args.device, requires_grad=backward
+            )
+            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device)
+            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device)
+
+            # warm_up
+            Bilinear2d(in_tensor, scale, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                (in_shape),
+                "; datatype:",
+                dtype,
+                "; scale_factor:",
+                scale,
+                "; mode:",
+                "bilinear",
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_nearest2d.py b/test/microbench/upsample_nearest2d.py
index 1e0a74aa4a..610bb3cac8 100644
--- a/test/microbench/upsample_nearest2d.py
+++ b/test/microbench/upsample_nearest2d.py
@@ -1,20 +1,19 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
-
 shape_list = [
     (8, 32, 256, 256, (3)),
     (8, 512, 16, 16, (1.5)),
     (16, 1024, 23, 23, (2.3)),
     (4, 32, 80, 128, (2)),
 ]
+backward = True
 
 
-def Interpolate2d(shape, dtype, channels_last, backward, mode):
+def Interpolate2d(shape, dtype, channels_last, backward, mode, device):
     N, C, H, W, scale_factor = shape[0], shape[1], shape[2], shape[3], shape[4]
 
     if channels_last:
@@ -34,14 +33,41 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode):
         output.backward(torch.ones_like(output))
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, mode, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Interpolate2d(shape, dtype, channels_last, backward, mode, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, mode, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Interpolate2d(shape, dtype, channels_last, backward, mode, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 for mode in ["nearest"]:
                     # warm up
-                    Interpolate2d(shape, dtype, channels_last, backward, mode)
+                    Interpolate2d(
+                        shape, dtype, channels_last, backward, mode, args.device
+                    )
 
                     # go
                     print(
@@ -58,20 +84,46 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode):
                         "; backward:",
                         backward,
                     )
-                    with profile(
-                        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                        record_shapes=True,
-                    ) as prof:
-                        for i in range(num_iter):
-                            Interpolate2d(shape, dtype, channels_last, backward, mode)
-                    print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                    # E2E time
-                    torch.xpu.synchronize()
-                    t1 = time.time()
-                    for i in 
range(num_iter): - Interpolate2d(shape, dtype, channels_last, backward, mode) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/upsample_nearest3d.py b/test/microbench/upsample_nearest3d.py index 841b404296..cab96e0173 100644 --- a/test/microbench/upsample_nearest3d.py +++ b/test/microbench/upsample_nearest3d.py @@ -1,19 +1,18 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 - shape_list = [ (8, 32, 256, 256, 2, (3)), (8, 512, 16, 16, 4, (1.5)), (16, 1024, 23, 23, 7, (2.3)), ] +backward = True -def Interpolate3d(shape, dtype, channels_last, backward): +def Interpolate3d(shape, dtype, channels_last, backward, device): N, C, H, W, D, scale_factor = ( shape[0], shape[1], @@ -42,13 +41,38 @@ def Interpolate3d(shape, dtype, channels_last, backward): output.backward(torch.ones_like(output)) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Interpolate3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Interpolate3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - Interpolate3d(shape, dtype, channels_last, backward=True) + Interpolate3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -63,20 +87,44 @@ def Interpolate3d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - Interpolate3d(shape, dtype, channels_last, backward=True) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - 
for i in range(num_iter): - Interpolate3d(shape, dtype, channels_last, backward=True) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/upsample_nearest_exact2d.py b/test/microbench/upsample_nearest_exact2d.py index 3eb043023e..0ade8e670a 100644 --- a/test/microbench/upsample_nearest_exact2d.py +++ b/test/microbench/upsample_nearest_exact2d.py @@ -1,20 +1,19 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 - shape_list = [ (8, 32, 256, 256, (3)), (8, 512, 16, 16, (1.5)), (16, 1024, 23, 23, (2.3)), (4, 32, 80, 128, (2)), ] +backward = True -def Interpolate2d(shape, dtype, channels_last, backward, mode): +def Interpolate2d(shape, dtype, channels_last, backward, mode, device): N, C, H, W, scale_factor = shape[0], shape[1], shape[2], shape[3], shape[4] if channels_last: @@ -34,14 +33,41 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode): output.backward(torch.ones_like(output)) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, channels_last, backward, mode, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Interpolate2d(shape, dtype, channels_last, backward, mode, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, mode, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Interpolate2d(shape, dtype, channels_last, backward, mode, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: for mode in ["nearest-exact"]: # warm up - Interpolate2d(shape, dtype, channels_last, backward, mode) + Interpolate2d( + shape, dtype, channels_last, backward, mode, args.device + ) # go print( @@ -58,20 +84,46 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - 
Interpolate2d(shape, dtype, channels_last, backward, mode) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - Interpolate2d(shape, dtype, channels_last, backward, mode) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args)
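Note on the cache_r and cache_w buffers: every timed loop starts with cache_r = cache_w + 1, an elementwise add across the two 1024 * 1024 * 1024 element float32 tensors that benchmark() allocates. Judging from the scripts, the intent is to stream roughly 8 GB of traffic (a 4 GiB read plus a 4 GiB write) through device memory between iterations so each measurement starts from a cold cache; the result of the add is overwritten on the next iteration and never read.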
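Each script also repeats the same per-device branching for synchronization and profiler activities, including a conditional expression used as a bare statement (torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()). A minimal sketch of how this could be factored into shared helpers follows; the names sync_device and profiler_activities are illustrative, not part of the patch, and torch.xpu is assumed to be present only in XPU-enabled PyTorch builds:

    import torch
    from torch.profiler import ProfilerActivity

    def sync_device(device):
        # Block until all queued kernels on the accelerator have finished.
        # CPU ops complete eagerly, so no synchronization is needed there.
        if device == "xpu":
            torch.xpu.synchronize()
        elif device == "cuda":
            torch.cuda.synchronize()

    def profiler_activities(device):
        # Always collect CPU activity; add an accelerator activity only
        # when one is actually in use.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        return activities

run_e2e could then open and close with sync_device(device), and run_profile with profile(activities=profiler_activities(device), record_shapes=True). Selecting activities this way would also avoid requesting ProfilerActivity.CUDA when a script is run with --device cpu, which the expression in the patches as written would do.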
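With the shared parse_args() added to every script, typical invocations look like the following (paths as in the diffs above):

    python test/microbench/upsample_bilinear2d.py                       # profile + E2E on xpu, 20 iterations
    python test/microbench/upsample_nearest2d.py --device cuda --e2e-only
    python test/microbench/upsample_nearest3d.py --profile-only --num-iter 50

--profile-only and --e2e-only are mutually exclusive; omitting both runs the profiler pass followed by the E2E timing pass, on the default device "xpu" with the default of 20 iterations.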