diff --git a/.github/scripts/op_calculate_best_perf.py b/.github/scripts/op_calculate_best_perf.py
index aa5c1e29f5..21bb0f2551 100644
--- a/.github/scripts/op_calculate_best_perf.py
+++ b/.github/scripts/op_calculate_best_perf.py
@@ -14,13 +14,32 @@
 updated_cases = []
 removed_cases = []
 
+def safe_float_convert(value):
+    try:
+        return float(value) if value.strip() else None
+    except (ValueError, AttributeError):
+        return None
+
 def update_baseline(xpu_file, baseline_file, remove_missing=False):
     with open(xpu_file) as f:
         xpu_reader = csv.DictReader(f, delimiter=';')
         xpu_rows = list(xpu_reader)
-        xpu_fieldnames = xpu_reader.fieldnames  # Keep original field order
-        fieldnames = [f for f in xpu_fieldnames if f not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']]
-        xpu_data = {make_key(row, fieldnames): (float(row['time(us)']), row) for row in xpu_rows}
+        xpu_fieldnames = xpu_reader.fieldnames
+        time_fields = ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']
+        fieldnames = [f for f in xpu_fieldnames if f not in time_fields]
+        xpu_data = {}
+        for row in xpu_rows:
+            key = make_key(row, fieldnames)
+            time_values = {}
+            if 'time(us)' in row:
+                time_val = safe_float_convert(row['time(us)'])
+                if time_val is not None:
+                    time_values['time(us)'] = time_val
+            if 'E2E total time(us)' in row:
+                e2e_val = safe_float_convert(row['E2E total time(us)'])
+                if e2e_val is not None:
+                    time_values['E2E total time(us)'] = e2e_val
+            xpu_data[key] = (time_values, row)
 
     with open(baseline_file) as f:
         baseline_reader = csv.DictReader(f, delimiter=';')
@@ -28,8 +47,15 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
         baseline_fieldnames = baseline_reader.fieldnames
 
     # To add new parameters of new ops into the baseline file
-    all_fieldnames = xpu_fieldnames + [f for f in baseline_fieldnames if f not in xpu_fieldnames]
-    fieldnames = [f for f in all_fieldnames if f not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']]
+    all_fieldnames = list(set(xpu_fieldnames + baseline_fieldnames))
+    # Maintain original order as much as possible
+    ordered_fieldnames = []
+    for f in xpu_fieldnames:
+        if f in all_fieldnames and f not in ordered_fieldnames:
+            ordered_fieldnames.append(f)
+    for f in baseline_fieldnames:
+        if f in all_fieldnames and f not in ordered_fieldnames:
+            ordered_fieldnames.append(f)
 
     baseline_keys = {make_key(row, fieldnames) for row in baseline_rows}
     xpu_keys = set(xpu_data.keys())
@@ -38,75 +64,117 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
     for row in baseline_rows:
         key = make_key(row, fieldnames)
         if key in xpu_data:
-            xpu_time, xpu_row = xpu_data[key]
-            baseline_time = float(row['time(us)'])
-
-            if xpu_time < baseline_time:
-                updated_row = {}
-                for field in all_fieldnames:
-                    updated_row[field] = xpu_row.get(field, row.get(field, ''))
-                updated_row['time(us)'] = str(xpu_time)
-                if 'E2E total time(us)' in row:
-                    updated_row['E2E total time(us)'] = row['E2E total time(us)']
-                updated_cases.append((key, baseline_time, xpu_time, updated_row))
-                updated_rows.append(updated_row)
-            else:
-                ordered_row = {}
-                for field in all_fieldnames:
-                    ordered_row[field] = row.get(field, '')
-                updated_rows.append(ordered_row)
+            xpu_times, xpu_row = xpu_data[key]
+            updated_row = {}
+
+            # Copy all fields from baseline first
+            for field in ordered_fieldnames:
+                updated_row[field] = row.get(field, '')
+
+            # Update with xpu values where they exist; time fields are
+            # handled separately below so a slower run cannot clobber
+            # the best recorded time
+            for field in ordered_fieldnames:
+                if field not in time_fields and field in xpu_row and xpu_row[field]:
+                    updated_row[field] = xpu_row[field]
+
+            # Handle time fields
+            updated = False
+            if 'time(us)' in xpu_times and 'time(us)' in row:
+                baseline_time = safe_float_convert(row['time(us)'])
+                if baseline_time is not None:
+                    xpu_time = xpu_times['time(us)']
+                    if xpu_time < baseline_time:
+                        updated_row['time(us)'] = str(xpu_time)
+                        updated = True
+
+            if 'E2E total time(us)' in xpu_times and 'E2E total time(us)' in row:
+                baseline_e2e = safe_float_convert(row['E2E total time(us)'])
+                if baseline_e2e is not None:
+                    xpu_e2e = xpu_times['E2E total time(us)']
+                    if xpu_e2e < baseline_e2e:
+                        updated_row['E2E total time(us)'] = str(xpu_e2e)
+                        updated = True
+
+            if updated:
+                updated_cases.append((key, row, updated_row))
+            updated_rows.append(updated_row)
         elif not remove_missing:
             ordered_row = {}
-            for field in all_fieldnames:
+            for field in ordered_fieldnames:
                 ordered_row[field] = row.get(field, '')
             updated_rows.append(ordered_row)
 
     # Add new cases
     for key in xpu_keys - baseline_keys:
-        xpu_time, xpu_row = xpu_data[key]
+        xpu_times, xpu_row = xpu_data[key]
         new_row = {}
-        for field in all_fieldnames:
+        for field in ordered_fieldnames:
             new_row[field] = xpu_row.get(field, '')
-        new_row['time(us)'] = str(xpu_time)
+
+        if 'time(us)' in xpu_times:
+            new_row['time(us)'] = str(xpu_times['time(us)'])
+        if 'E2E total time(us)' in xpu_times:
+            new_row['E2E total time(us)'] = str(xpu_times['E2E total time(us)'])
+
         updated_rows.append(new_row)
-        added_cases.append((key, xpu_time, new_row))
+        added_cases.append((key, xpu_times, new_row))
 
     # Resolve removed cases
     if remove_missing:
         for key in baseline_keys - xpu_keys:
             removed_case = next(row for row in baseline_rows if make_key(row, fieldnames) == key)
-            removed_cases.append((key, float(removed_case['time(us)']), removed_case))
+            removed_cases.append((key, removed_case))
 
     if added_cases:
         print(f"\nAdded {len(added_cases)} new case(s):")
-        for key, time, row in added_cases:
+        for key, times, row in added_cases:
             print(f"\n[New Case] {format_case(key)}")
-            print(f"Time: {time} us")
+            if 'time(us)' in times:
+                print(f"Time: {times['time(us)']} us")
+            if 'E2E total time(us)' in times:
+                print(f"E2E Time: {times['E2E total time(us)']} us")
             print("Parameters:")
             for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
     if updated_cases:
         print(f"\nUpdated {len(updated_cases)} case(s):")
-        for key, old_time, new_time, row in updated_cases:
+        for key, old_row, new_row in updated_cases:
             print(f"\n[Updated] {format_case(key)}")
-            print(f"Time: {old_time} us → {new_time} us")
+            if 'time(us)' in old_row and 'time(us)' in new_row:
+                old_time = safe_float_convert(old_row['time(us)'])
+                new_time = safe_float_convert(new_row['time(us)'])
+                if old_time is not None and new_time is not None and old_time != new_time:
+                    print(f"Time: {old_time} us → {new_time} us")
+
+            if 'E2E total time(us)' in old_row and 'E2E total time(us)' in new_row:
+                old_e2e = safe_float_convert(old_row['E2E total time(us)'])
+                new_e2e = safe_float_convert(new_row['E2E total time(us)'])
+                if old_e2e is not None and new_e2e is not None and old_e2e != new_e2e:
+                    print(f"E2E Time: {old_e2e} us → {new_e2e} us")
+
             print("Parameters:")
-            for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+            for k, v in new_row.items():
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
     if remove_missing and removed_cases:
         print(f"\nRemoved {len(removed_cases)} case(s):")
-        for key, time, row in removed_cases:
+        for key, row in removed_cases:
             print(f"\n[Removed] {format_case(key)}")
-            print(f"Time: {time} us")
+            if 'time(us)' in row:
+                time_val = safe_float_convert(row['time(us)'])
+                if time_val is not None:
+                    print(f"Time: {time_val} us")
+            if 'E2E total time(us)' in row:
+                e2e_val = safe_float_convert(row['E2E total time(us)'])
+                if e2e_val is not None:
+                    print(f"E2E Time: {e2e_val} us")
             print("Parameters:")
             for k, v in row.items():
-                if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']:
+                if k not in time_fields:
                     print(f"  {k}: {v}")
             print("-" * 60)
 
@@ -117,7 +185,7 @@ def update_baseline(xpu_file, baseline_file, remove_missing=False):
     Path(baseline_file).rename(backup_file)
 
     with open(baseline_file, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=all_fieldnames, delimiter=';')
+        writer = csv.DictWriter(f, fieldnames=ordered_fieldnames, delimiter=';')
         writer.writeheader()
         writer.writerows(updated_rows)
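A note on the update semantics above: `make_key` and `format_case` are defined elsewhere in the script and are not shown in this diff, so the `make_key` below is an assumption for illustration only. The idea is that a case is identified by its non-time columns, and the baseline keeps the faster of the two measurements while tolerating blank cells. A minimal sketch:

# Minimal sketch of the min-keeping update, not the real script.
# make_key here is a hypothetical stand-in: it maps a CSV row to a
# hashable tuple of (field, value) pairs over the non-time columns.
def make_key(row, fieldnames):
    return tuple((f, str(row.get(f, ''))) for f in fieldnames)

def safe_float_convert(value):
    try:
        return float(value) if value.strip() else None
    except (ValueError, AttributeError):
        return None

baseline = {'op_name': 'add', 'time(us)': '12.0'}
current = {'op_name': 'add', 'time(us)': ''}      # blank cell: ignored
rerun = {'op_name': 'add', 'time(us)': '10.5'}
fields = ['op_name']
assert make_key(baseline, fields) == make_key(rerun, fields)

# Keep the best (lowest) valid time among the candidate rows.
times = [safe_float_convert(r['time(us)']) for r in (baseline, current, rerun)]
best = min(t for t in times if t is not None)
print(best)  # 10.5 -> the baseline row would be updated to the new best time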
".join(params) -def display_comparison(results, threshold, xpu_file): +def display_comparison(results, threshold, xpu_file, compare_both): if 'forward' in xpu_file.lower(): direction = "Forward" elif 'backward' in xpu_file.lower(): @@ -61,97 +66,132 @@ def display_comparison(results, threshold, xpu_file): write_to_github_summary(f"## {direction} No outlier exceeding ({threshold:.0%})") return - results['diff_float'] = results['difference'].str.rstrip('%').astype(float) - regression = results[results['change'] == '↓'].sort_values('diff_float', ascending=False) - improvement = results[results['change'] == '↑'].sort_values('diff_float') + # Prepare display records - always include both metrics when available + display_records = [] + for _, row in results.iterrows(): + record = display_row(row) + display_record = { + 'Case Name': record['case_name'], + 'Op Name': record['op_name'], + 'Datatype': record['datatype'], + 'Parameters': format_parameters(record) + } - if not regression.empty: - print("\n🔴 Regression:") - display_records = [] - for _, row in regression.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] + # Always try to include profile time if it exists in the data + if 'profile_time_xpu' in record or 'profile_time_base' in record: + display_record.update({ + 'Profile Current(us)': record.get('profile_time_xpu', 'N/A'), + 'Profile Baseline(us)': record.get('profile_time_base', 'N/A'), + 'Profile Diff': record.get('profile_diff', 'N/A'), + 'Profile Change': record.get('profile_change', '') + }) + + # Always try to include E2E time if it exists in the data + if 'e2e_time_xpu' in record or 'e2e_time_base' in record: + display_record.update({ + 'E2E Current(us)': record.get('e2e_time_xpu', 'N/A'), + 'E2E Baseline(us)': record.get('e2e_time_base', 'N/A'), + 'E2E Diff': record.get('e2e_diff', 'N/A'), + 'E2E Change': record.get('e2e_change', '') }) + display_records.append(display_record) + + # Classify records based on changes + regression_records = [] + improvement_records = [] + mixed_records = [] + + for record in results.to_dict('records'): + profile_change = record.get('profile_change') + e2e_change = record.get('e2e_change') + + profile_regression = profile_change == '↓' + profile_improve = profile_change == '↑' + e2e_regression = e2e_change == '↓' + e2e_improve = e2e_change == '↑' + + if (profile_regression and e2e_improve) or (profile_improve and e2e_regression): + mixed_records.append(record) + elif profile_regression or e2e_regression: + regression_records.append(record) + elif profile_improve or e2e_improve: + improvement_records.append(record) + + # Print results + if regression_records: + print("\n🔴 Regression:") + regression_display = [r for r in display_records + if (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓') + and not (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑')] print(tabulate( - display_records, + regression_display, headers="keys", tablefmt='grid', showindex=False, floatfmt=".2f" )) - if not improvement.empty: + if improvement_records: print("\n🟢 Improvement:") - display_records = [] - for _, row in improvement.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': 
record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + improvement_display = [r for r in display_records + if (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑') + and not (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓')] + print(tabulate( + improvement_display, + headers="keys", + tablefmt='grid', + showindex=False, + floatfmt=".2f" + )) + if mixed_records: + print("\n🟡 Mixed Changes (one metric improves, another regression):") + mixed_display = [r for r in display_records + if ((r.get('Profile Change', '') == '↑' and r.get('E2E Change', '') == '↓') or + (r.get('Profile Change', '') == '↓' and r.get('E2E Change', '') == '↑'))] print(tabulate( - display_records, + mixed_display, headers="keys", tablefmt='grid', showindex=False, floatfmt=".2f" )) - # Print Summary on Github Action Summary + + # Generate GitHub summary summary_output = f"## {direction} Performance Comparison Results\n" - if not regression.empty: - summary_output += f"\n### 🔴 {direction} Regression\n" - display_records = [] - for _, row in regression.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + if regression_records: + summary_output += "\n### 🔴 Regression\n" summary_output += tabulate( - display_records, + [r for r in display_records + if (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓') + and not (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑')], headers="keys", tablefmt='github', showindex=False, floatfmt=".2f" ) + "\n" - if not improvement.empty: - summary_output += f"\n### 🟢 {direction} Improvement\n" - display_records = [] - for _, row in improvement.iterrows(): - record = display_row(row) - display_records.append({ - 'Case Name': record['case_name'], - 'Op Name': record['op_name'], - 'Datatype': record['datatype'], - 'Parameters': format_parameters(record), - 'Current Time(us)': record['time_xpu_file'], - 'Baseline Time(us)': record['time_baseline_file'], - 'Difference': record['difference'] - }) + if improvement_records: + summary_output += "\n### 🟢 Improvement\n" + summary_output += tabulate( + [r for r in display_records + if (r.get('Profile Change', '') == '↑' or r.get('E2E Change', '') == '↑') + and not (r.get('Profile Change', '') == '↓' or r.get('E2E Change', '') == '↓')], + headers="keys", + tablefmt='github', + showindex=False, + floatfmt=".2f" + ) + "\n" + if mixed_records: + summary_output += "\n### 🟡 Mixed Changes\n" + summary_output += "One metric improves while another regression\n" summary_output += tabulate( - display_records, + [r for r in display_records + if ((r.get('Profile Change', '') == '↑' and r.get('E2E Change', '') == '↓') or + (r.get('Profile Change', '') == '↓' and r.get('E2E Change', '') == '↑'))], headers="keys", tablefmt='github', showindex=False, @@ -160,50 +200,100 @@ def display_comparison(results, threshold, xpu_file): write_to_github_summary(summary_output) -def compare_op_time_values(xpu_file, baseline_file, threshold=0.05, output_file=None): - df_xpu = pd.read_csv(xpu_file, sep=';') - df_baseline = 
pd.read_csv(baseline_file, sep=';') +def compare_time_values(xpu_file, baseline_file, threshold=0.05, profile_only=False, e2e_only=False): + def prepare_df(df): + df.columns = df.columns.str.strip() + if 'time(us)' not in df.columns: + df['time(us)'] = float('nan') + if 'E2E total time(us)' not in df.columns: + df['E2E total time(us)'] = float('nan') + return df + + df_xpu = prepare_df(pd.read_csv(xpu_file, sep=';')) + df_baseline = prepare_df(pd.read_csv(baseline_file, sep=';')) + + for col in ['time(us)', 'E2E total time(us)']: + df_xpu[col] = pd.to_numeric(df_xpu[col], errors='coerce') + df_baseline[col] = pd.to_numeric(df_baseline[col], errors='coerce') records_xpu = [preprocess_row(row) for _, row in df_xpu.iterrows()] records_baseline = [preprocess_row(row) for _, row in df_baseline.iterrows()] - dict_xpu = { - tuple((k, str(v)) for k, v in record.items() if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']): - record['time(us)'] - for record in records_xpu - } - dict_baseline = { - tuple((k, str(v)) for k, v in record.items() if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']): - record['time(us)'] - for record in records_baseline + data_dict = { + 'xpu': {'profile': {}, 'e2e': {}}, + 'baseline': {'profile': {}, 'e2e': {}} } - common_keys = set(dict_xpu.keys()) & set(dict_baseline.keys()) + + for record, source in [(records_xpu, 'xpu'), (records_baseline, 'baseline')]: + for r in record: + key = tuple((k, str(v)) for k, v in r.items() + if k not in ['time(us)', 'E2E total time(us)', 'E2E forward time(us)']) + + for time_type in ['profile', 'e2e']: + col = 'time(us)' if time_type == 'profile' else 'E2E total time(us)' + if col in r: + try: + time_val = float(r[col]) + if not pd.isna(time_val): + data_dict[source][time_type][key] = time_val + except (ValueError, TypeError): + continue + results = [] + compare_both = not profile_only and not e2e_only + all_keys = set().union(*[set(data_dict[s][t].keys()) + for s in data_dict for t in data_dict[s]]) - for key in common_keys: - time_xpu = dict_xpu[key] - time_baseline = dict_baseline[key] - - # Skip comparison if time_xpu or time_baseline is 0 - if time_xpu == 0 or time_baseline == 0: - continue - - diff = (time_baseline - time_xpu) / time_xpu - # Compare Time, Lower is better - if abs(diff) > threshold: - record = dict(key) - print(record) - record.update({ - 'time_xpu_file': time_xpu, - 'time_baseline_file': time_baseline, - 'difference': f"{diff:.2%}", - 'change': "↑" if diff > 0 else "↓" - }) - results.append(record) + for key in all_keys: + record = dict(key) + should_include = False - result_df = pd.DataFrame(results) if results else pd.DataFrame() - display_comparison(result_df, threshold, xpu_file) + if not e2e_only and key in data_dict['xpu']['profile'] and key in data_dict['baseline']['profile']: + xpu_time = data_dict['xpu']['profile'][key] + base_time = data_dict['baseline']['profile'][key] + if xpu_time != 0 and base_time != 0: + try: + diff = (base_time - xpu_time) / xpu_time + record.update({ + 'profile_time_xpu': xpu_time, + 'profile_time_base': base_time, + 'profile_diff': f"{diff:.2%}", + 'profile_change': "↑" if diff > threshold else "↓" if diff < -threshold else "" + }) + if abs(diff) > threshold: + should_include = True + except (TypeError, ValueError): + pass + + if not profile_only and key in data_dict['xpu']['e2e'] and key in data_dict['baseline']['e2e']: + xpu_time = data_dict['xpu']['e2e'][key] + base_time = data_dict['baseline']['e2e'][key] + + if xpu_time != 0 and 
base_time != 0: + try: + diff = (base_time - xpu_time) / xpu_time + record.update({ + 'e2e_time_xpu': xpu_time, + 'e2e_time_base': base_time, + 'e2e_diff': f"{diff:.2%}", + 'e2e_change': "↑" if diff > threshold else "↓" if diff < -threshold else "" + }) + if abs(diff) > threshold: + should_include = True + except (TypeError, ValueError): + pass + + if compare_both: + if should_include: + results.append(record) + else: + if ((profile_only and 'profile_change' in record and record['profile_change']) or + (e2e_only and 'e2e_change' in record and record['e2e_change'])): + results.append(record) + + result_df = pd.DataFrame(results) if results else pd.DataFrame() + display_comparison(result_df, threshold, xpu_file, compare_both) def main(): parser = argparse.ArgumentParser(description='Compare time values between two CSV files') @@ -211,19 +301,40 @@ def main(): parser.add_argument('-b', '--baseline_file', required=True, help="XPU OP baseline result csv files dir") parser.add_argument('-t', '--threshold', type=float, default=0.10, help='Threshold for time difference (default: 0.10 for 10%)') + parser.add_argument('--profile-only', action='store_true', + help='Only compare profile time (time(us))') + parser.add_argument('--e2e-only', action='store_true', + help='Only compare E2E time (E2E total time(us))') args = parser.parse_args() - print(f" Compared file: {args.xpu_file} 和 {args.baseline_file}") + if args.profile_only and args.e2e_only: + raise ValueError("Cannot specify both --profile-only and --e2e-only") + + print(f" Compared file: {args.xpu_file} and {args.baseline_file}") print(f" Threshold: {args.threshold:.0%}") + if args.profile_only: + print(" Comparing only profile time (time(us))") + elif args.e2e_only: + print(" Comparing only E2E time (E2E total time(us))") + else: + print(" Comparing both profile time and E2E time in same table") + write_to_github_summary("## Performance Comparison Set") write_to_github_summary(f"- Threshold: {args.threshold:.0%}") + if args.profile_only: + write_to_github_summary("- Comparing only profile time (time(us))") + elif args.e2e_only: + write_to_github_summary("- Comparing only E2E time (E2E total time(us))") + else: + write_to_github_summary("- Comparing both profile time and E2E time in same table") - compare_op_time_values( + compare_time_values( xpu_file=args.xpu_file, baseline_file=args.baseline_file, threshold=args.threshold, + profile_only=args.profile_only, + e2e_only=args.e2e_only ) - if __name__ == "__main__": main() diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 9760e6d960..35a5a62536 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -152,18 +152,25 @@ jobs: run: | REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ --json body -q .body |grep "Inductor-XPU-OP-Benchmark-Data" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*" rm -rf ${{ github.workspace }}/reference mkdir -p ${{ github.workspace }}/reference - mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference - mkdir -p ${{ github.workspace }}/baseline - if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then - cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline - cp ${{ github.workspace 
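The sign convention in compare_time_values is easy to misread: diff = (baseline - current) / current, so a positive diff means the current run is faster (improvement) and a negative diff means a regression. A small worked example with an illustrative 10% threshold:

threshold = 0.10

def classify(base_time, cur_time, threshold):
    # diff = (baseline - current) / current: positive when the current run is faster
    diff = (base_time - cur_time) / cur_time
    return "↑" if diff > threshold else "↓" if diff < -threshold else ""

print(classify(12.0, 10.0, threshold))  # '↑'  (+20.0%: improvement)
print(classify(10.0, 12.0, threshold))  # '↓'  (-16.7%: regression)
print(classify(10.0, 10.5, threshold))  # ''   (-4.8%: within threshold)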
diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml
index 9760e6d960..35a5a62536 100644
--- a/.github/workflows/_linux_op_benchmark.yml
+++ b/.github/workflows/_linux_op_benchmark.yml
@@ -152,18 +152,25 @@ jobs:
         run: |
           REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
             --json body -q .body |grep "Inductor-XPU-OP-Benchmark-Data" |sed 's/.*: *//')"
-          gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*"
           rm -rf ${{ github.workspace }}/reference
           mkdir -p ${{ github.workspace }}/reference
-          mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference
-          mkdir -p ${{ github.workspace }}/baseline
-          if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then
-            cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline
-            cp ${{ github.workspace }}/reference/new_baseline/baseline_backward_op_summary.csv ${{ github.workspace }}/baseline
+          mkdir -p ${{ github.workspace }}/baseline
+          if [[ -n "${REFERENCE_RUN_ID}" ]]; then
+            echo "Using reference run ID: ${REFERENCE_RUN_ID}"
+            gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*"
+            mv -f Inductor-XPU-OP-Benchmark-Data-*-Updated/* ${{ github.workspace }}/reference
+
+            if [[ -f "${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then
+              cp ${{ github.workspace }}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline
+              cp ${{ github.workspace }}/reference/new_baseline/baseline_backward_op_summary.csv ${{ github.workspace }}/baseline
+            else
+              cp ${{ github.workspace }}/reference/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
+              cp ${{ github.workspace }}/reference/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
+            fi
           else
-            cp ${{ github.workspace }}/reference/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
-            cp ${{ github.workspace }}/reference/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
+            echo "No reference run ID found, using local op_benchmark as baseline"
+            cp ${{ github.workspace }}/op_benchmark/forward_op_summary.csv ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv
+            cp ${{ github.workspace }}/op_benchmark/backward_op_summary.csv ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv
           fi
       - name: Check the OP Regression
         run: |
@@ -175,7 +182,7 @@ jobs:
       - name: Update OP Baseline
         run: |
          pip install tabulate pandas
-          mkdir ${{ github.workspace }}/new_baseline
+          mkdir -p ${{ github.workspace }}/new_baseline
           cp ${{ github.workspace }}/baseline/baseline*.csv ${{ github.workspace }}/new_baseline
           # Update forward op
           python ${{ github.workspace }}/.github/scripts/op_calculate_best_perf.py --xpu ${{ github.workspace }}/op_benchmark/forward_op_summary.csv --baseline ${{ github.workspace }}/new_baseline/baseline_forward_op_summary.csv -r
diff --git a/test/microbench/adaptive_avg_pool2d.py b/test/microbench/adaptive_avg_pool2d.py
index a334a9252b..656c74ff73 100644
--- a/test/microbench/adaptive_avg_pool2d.py
+++ b/test/microbench/adaptive_avg_pool2d.py
@@ -1,18 +1,17 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (8, 512, 32, 32, (7, 7)),
     (8, 256, 56, 56, (14, 14)),
 ]
-num_iter = 20
+backward = True
 
 
-def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
+def Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, output_size = (
         shape[0],
         shape[1],
@@ -48,13 +47,38 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
+                Adaptive_AVGPool2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -69,20 +93,44 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    Adaptive_AVGPool2d(shape, dtype, channels_last, backward)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
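The run_e2e helpers above use a conditional expression purely for its side effect, which is easy to misread. A small hypothetical helper (not part of this PR) expresses the same synchronization logic more plainly and applies to every microbench file below:

import torch

def sync(device: str) -> None:
    # Block until all queued kernels on the device have finished, so the
    # wall-clock window in run_e2e measures device work rather than just
    # kernel launch overhead.
    if device == "xpu":
        torch.xpu.synchronize()
    elif device == "cuda":
        torch.cuda.synchronize()
    # CPU tensors execute eagerly; nothing to synchronize.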
diff --git a/test/microbench/avg_pool2d.py b/test/microbench/avg_pool2d.py
index 658bf8a56f..cbb1ab2e2c 100644
--- a/test/microbench/avg_pool2d.py
+++ b/test/microbench/avg_pool2d.py
@@ -1,20 +1,19 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (16, 24, 112, 112, (3), (2)),
     (16, 1984, 7, 7, (3, 2), (2, 1)),
     (64, 1024, 112, 112, (6), (4)),
     (16, 2048, 224, 224, (3), (2)),
 ]
-num_iter = 20
+backward = True
 
 
-def AVGPool2d(shape, dtype, channels_last, backward):
+def AVGPool2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, kernel_size, stride = (
         shape[0],
         shape[1],
@@ -55,13 +54,38 @@ def AVGPool2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            AVGPool2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        AVGPool2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                AVGPool2d(shape, dtype, channels_last, backward)
+                AVGPool2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -78,20 +102,45 @@ def AVGPool2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        AVGPool2d(shape, dtype, channels_last, backward)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    AVGPool2d(shape, dtype, channels_last, backward)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/avg_pool3d.py b/test/microbench/avg_pool3d.py
index 6d9057f89d..77cf810341 100644
--- a/test/microbench/avg_pool3d.py
+++ b/test/microbench/avg_pool3d.py
@@ -1,19 +1,18 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-
 shape_list = [
     (16, 24, 28, 19, 19, (3), (2)),
     (16, 1984, 7, 7, 7, (3, 2, 2), (2, 1, 2)),
     (64, 1024, 14, 14, 14, (6), (4)),
 ]
-num_iter = 20
+backward = True
 
 
-def AVGPool3d(shape, dtype, channels_last, backward):
+def AVGPool3d(shape, dtype, channels_last, backward, device):
     N, C, D, H, W, kernel_size, stride = (
         shape[0],
         shape[1],
@@ -55,13 +54,38 @@ def AVGPool3d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            AVGPool3d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        AVGPool3d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                AVGPool3d(shape, dtype, channels_last, backward)
+                AVGPool3d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -78,20 +102,45 @@ def AVGPool3d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        AVGPool3d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    AVGPool3d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_1d.py b/test/microbench/batch_norm_1d.py
index b55d96de72..0efc3da933 100644
--- a/test/microbench/batch_norm_1d.py
+++ b/test/microbench/batch_norm_1d.py
@@ -1,56 +1,92 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [((64, 8), (8)), ((4, 128, 15000), (128)), ((4, 256, 512), (256))]
+backward = True
 
-for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-    for shape in shape_list:
-        backward = True
-        # input
-        input = torch.randn(shape[0], device=device, dtype=dtype)
-
-        if backward:
-            input.requires_grad_(True)
-
-        # warm up
-        m = torch.nn.BatchNorm1d(shape[1], device=device)
-        output = m(input)
-
-        print(
-            "shape:",
-            shape[0],
-            "; datatype:",
-            dtype,
-            "; num_features:",
-            shape[1],
-            "; backward:",
-            backward,
-        )
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                m = torch.nn.BatchNorm1d(shape[1], device=device)
-                output = m(input)
-                if backward:
-                    gy = torch.empty_like(output)
-                    output.backward(gy)
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            m = torch.nn.BatchNorm1d(shape[1], device=device)
-            output = m(input)
-            if backward:
-                gy = torch.empty_like(output)
-                output.backward(gy)
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def BTN1d(m, input, backward, device):
+    output = m(input)
+    if backward:
+        gy = torch.empty_like(output)
+        output.backward(gy)
+
+
+def run_profile(m, input, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN1d(m, input, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(m, input, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN1d(m, input, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], device=args.device, dtype=dtype)
+            if backward:
+                input.requires_grad_(True)
+            m = torch.nn.BatchNorm1d(shape[1], device=args.device)
+            # warm up
+            BTN1d(m, input, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                shape[0],
+                "; datatype:",
+                dtype,
+                "; num_features:",
+                shape[1],
+                "; backward:",
+                backward,
+            )
+
+            if not args.e2e_only:
+                run_profile(m, input, backward, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(m, input, backward, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_2d.py b/test/microbench/batch_norm_2d.py
index 781e00881d..e8f83ad379 100644
--- a/test/microbench/batch_norm_2d.py
+++ b/test/microbench/batch_norm_2d.py
@@ -1,10 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [
     (256, 256, 56, 56, 256),
     (256, 2048, 7, 7, 2048),
@@ -13,9 +12,10 @@
     (4, 8, 640, 1024, 8),
     (4, 48, 20, 32, 48),
 ]
+backward = True
 
 
-def BTN2d(shape, dtype, channels_last, backward):
+def BTN2d(shape, dtype, channels_last, backward, device):
     N, C, H, W, num_features = shape[0], shape[1], shape[2], shape[3], shape[4]
 
     if channels_last:
@@ -39,13 +39,38 @@ def BTN2d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for shape in shape_list:
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN2d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN2d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                BTN2d(shape, dtype, channels_last, backward)
+                BTN2d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -60,20 +85,45 @@ def BTN2d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        BTN2d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    BTN2d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/batch_norm_3d.py b/test/microbench/batch_norm_3d.py
index 26b3a0a981..7a02f7cf2f 100644
--- a/test/microbench/batch_norm_3d.py
+++ b/test/microbench/batch_norm_3d.py
@@ -1,14 +1,14 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [(2, 5, 6, 3, 5, 5), (2, 8, 64, 64, 64, 8), (16, 16, 128, 128, 256, 16)]
+backward = True
 
 
-def BTN3d(shape, dtype, channels_last, backward):
+def BTN3d(shape, dtype, channels_last, backward, device):
     N, C, D, H, W, num_features = (
         shape[0],
         shape[1],
@@ -39,13 +39,38 @@ def BTN3d(shape, dtype, channels_last, backward):
         output[0].backward(grad)
 
 
-if __name__ == "__main__":
-    backward = True
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for shape in shape_list:
+def run_profile(shape, dtype, channels_last, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            BTN3d(shape, dtype, channels_last, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        BTN3d(shape, dtype, channels_last, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 # warm up
-                BTN3d(shape, dtype, channels_last, backward)
+                BTN3d(shape, dtype, channels_last, backward, args.device)
 
                 # go
                 print(
@@ -60,20 +85,44 @@ def BTN3d(shape, dtype, channels_last, backward):
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        BTN3d(shape, dtype, channels_last, backward=True)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    BTN3d(shape, dtype, channels_last, backward=True)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+                if not args.profile_only:
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/col2im.py b/test/microbench/col2im.py
index 88a0593f2f..29dccdb9db 100644
--- a/test/microbench/col2im.py
+++ b/test/microbench/col2im.py
@@ -1,10 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 shape_list = [
     ((1, 147, 1359556), (1200, 1200)),
     ((1, 147, 36100), (224, 224)),
@@ -13,56 +12,104 @@
 ]
 kernel_size = (7, 7)
 dilation = (6, 6)
-
 backward = True
 
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape[0], dtype=dtype, device=device, requires_grad=True)
-        if backward:
-            input.requires_grad_(True)
-        output_size = shape[1]
-
-        # warm up
-        output = torch.nn.functional.fold(
-            input, output_size, kernel_size, dilation, 1, 1
-        )
-        if backward:
-            torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
-
-        # go
-        print(
-            "shape:",
-            shape[0],
-            "; datatype:",
-            dtype,
-            "; output_size:",
-            shape[1],
-            "; backward:",
-            backward,
-        )
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                output = torch.nn.functional.fold(
-                    input, output_size, kernel_size, dilation, 1, 1
-                )
-                if backward:
-                    torch.autograd.grad(
-                        output, input, grad_outputs=torch.ones_like(output)
-                    )
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            output = torch.nn.functional.fold(
-                input, output_size, kernel_size, dilation, 1, 1
-            )
-            if backward:
-                torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def Col2im(input, output_size, kernel_size, dilation, backward, device):
+    output = torch.nn.functional.fold(input, output_size, kernel_size, dilation, 1, 1)
+    if backward:
+        torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))
+
+
+def run_profile(input, output_size, kernel_size, dilation, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Col2im(input, output_size, kernel_size, dilation, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, output_size, kernel_size, dilation, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Col2im(input, output_size, kernel_size, dilation, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(
+                shape[0], dtype=dtype, device=args.device, requires_grad=True
+            )
+            if backward:
+                input.requires_grad_(True)
+            output_size = shape[1]
+            # warm up
+            Col2im(input, output_size, kernel_size, dilation, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                shape[0],
+                "; datatype:",
+                dtype,
+                "; output_size:",
+                shape[1],
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    input,
+                    output_size,
+                    kernel_size,
+                    dilation,
+                    backward,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    input,
+                    output_size,
+                    kernel_size,
+                    dilation,
+                    backward,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
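All of the converted microbench scripts share the same CLI defined by parse_args(). A hedged usage sketch follows; this variant accepts an explicit argv list so it can be exercised without a GPU, whereas the scripts themselves read sys.argv:

import argparse

def parse_args(argv=None):
    # Hypothetical testable twin of the parse_args() in the scripts above.
    parser = argparse.ArgumentParser(description="OP Benchmark")
    parser.add_argument("--device", type=str, default="xpu")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--profile-only", action="store_true")
    group.add_argument("--e2e-only", action="store_true")
    parser.add_argument("--num-iter", type=int, default=20)
    return parser.parse_args(argv)

args = parse_args(["--device", "cuda", "--e2e-only", "--num-iter", "50"])
print(args.device, args.e2e_only, args.num_iter)  # cuda True 50
# parse_args(["--profile-only", "--e2e-only"]) would exit with an error,
# since the two flags are declared mutually exclusive.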
diff --git a/test/microbench/distance.cdist.py b/test/microbench/distance.cdist.py
index c7047b86e2..1d638bbb57 100644
--- a/test/microbench/distance.cdist.py
+++ b/test/microbench/distance.cdist.py
@@ -1,11 +1,9 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-backward = True
-num_iter = 20
 shape_list = [
     ((8, 16), (2, 16)),
     ((10, 8192), (10, 8192)),
@@ -13,60 +11,113 @@
     ((4, 512, 512), (4, 513, 512)),
     ((1, 512, 8192), (1, 1024, 8192)),
 ]
+backward = True
+
+
+def Cdist(input1, input2, backward, p, compute_mode, device):
+    output = torch.cdist(input1, input2, p, compute_mode)
+    if backward:
+        gy = torch.empty_like(output)
+        output.backward(gy)
+
+
+def run_profile(input1, input2, backward, p, compute_mode, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Cdist(input1, input2, backward, p, compute_mode, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input1, input2, backward, p, compute_mode, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Cdist(input1, input2, backward, p, compute_mode, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
 
-for shape in shape_list:
-    for p in [0, 1, 2]:
-        for compute_mode in [
-            "use_mm_for_euclid_dist_if_necessary",
-            "use_mm_for_euclid_dist",
-            "donot_use_mm_for_euclid_dist",
-        ]:
-            for dtype in [torch.float32]:
-                input1 = torch.rand(shape[0], device=device, dtype=dtype)
-                input2 = torch.rand(shape[1], device=device, dtype=dtype)
-                if backward:
-                    input1.requires_grad_(True)
-                    input2.requires_grad_(True)
-
-                # warm up
-                output = torch.cdist(input1, input2, p, compute_mode)
-                if backward:
-                    gy = torch.empty_like(output)
-                    output.backward(gy)
-
-                # go
-                print(
-                    "shape:",
-                    (shape),
-                    "; datatype:",
-                    dtype,
-                    "; P:",
-                    p,
-                    "; mode:",
-                    compute_mode,
-                    "; backward:",
-                    backward,
-                )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        output = torch.cdist(input1, input2, p, compute_mode)
-                        if backward:
-                            gy = torch.empty_like(output)
-                            output.backward(gy)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    output = torch.cdist(input1, input2, p, compute_mode)
-                    if backward:
-                        gy = torch.empty_like(output)
-                        output.backward(gy)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def benchmark(args):
+    for shape in shape_list:
+        for p in [0, 1, 2]:
+            for compute_mode in [
+                "use_mm_for_euclid_dist_if_necessary",
+                "use_mm_for_euclid_dist",
+                "donot_use_mm_for_euclid_dist",
+            ]:
+                for dtype in [torch.float32]:
+                    input1 = torch.rand(shape[0], device=args.device, dtype=dtype)
+                    input2 = torch.rand(shape[1], device=args.device, dtype=dtype)
+                    if backward:
+                        input1.requires_grad_(True)
+                        input2.requires_grad_(True)
+                    # warm up
+                    Cdist(input1, input2, backward, p, compute_mode, args.device)
+
+                    # go
+                    print(
+                        "shape:",
+                        (shape),
+                        "; datatype:",
+                        dtype,
+                        "; P:",
+                        p,
+                        "; mode:",
+                        compute_mode,
+                        "; backward:",
+                        backward,
+                    )
+
+                    if not args.e2e_only:
+                        run_profile(
+                            input1,
+                            input2,
+                            backward,
+                            p,
+                            compute_mode,
+                            args.device,
+                            args.num_iter,
+                        )
+
+                    if not args.profile_only:
+                        run_e2e(
+                            input1,
+                            input2,
+                            backward,
+                            p,
+                            compute_mode,
+                            args.device,
+                            args.num_iter,
+                        )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distance.pdist.py b/test/microbench/distance.pdist.py
index b0f59dd6ef..7c737347db 100644
--- a/test/microbench/distance.pdist.py
+++ b/test/microbench/distance.pdist.py
@@ -1,46 +1,83 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
 forward_shape_list = [(2048, 256), (2048, 8192), (16, 8192 * 4)]
 backward_shape_list = [(256, 256), (256, 8192), (16, 8192 * 4)]
 
-for backward in [False, True]:
-    shape_list = backward_shape_list if backward else forward_shape_list
-    for shape in shape_list:
-        for dtype in [torch.float32]:
-            input = torch.rand(shape, device=device, dtype=dtype)
-            if backward:
-                input.requires_grad_(True)
-
-            # warm up
-            b = torch.nn.functional.pdist(input, 2)
-
-            # go
-            print("shape:", shape, "; datatype:", dtype, "; backward:", backward)
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(20):
-                    b = torch.nn.functional.pdist(input, 2)
-                    if backward:
-                        gy = torch.empty_like(b)
-                        b.backward(gy)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                b = torch.nn.functional.pdist(input, 2)
-                if backward:
-                    gy = torch.empty_like(b)
-                    b.backward(gy)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+
+def Pdist(input, backward, device):
+    b = torch.nn.functional.pdist(input, 2)
+    if backward:
+        gy = torch.empty_like(b)
+        b.backward(gy)
+
+
+def run_profile(input, backward, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Pdist(input, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, backward, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Pdist(input, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for backward in [False, True]:
+        shape_list = backward_shape_list if backward else forward_shape_list
+        for shape in shape_list:
+            for dtype in [torch.float32]:
+                input = torch.rand(shape, device=args.device, dtype=dtype)
+                if backward:
+                    input.requires_grad_(True)
+                # warm up
+                Pdist(input, backward, args.device)
+
+                # go
+                print("shape:", shape, "; datatype:", dtype, "; backward:", backward)
+                if not args.e2e_only:
+                    run_profile(input, backward, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, backward, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.bernoulli.py b/test/microbench/distribution.bernoulli.py
index 9f36400b32..96d19233a6 100644
--- a/test/microbench/distribution.bernoulli.py
+++ b/test/microbench/distribution.bernoulli.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -5,18 +6,45 @@
 
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
 
-if __name__ == "__main__":
+
+def Bernoulli(input, p, device):
+    input.bernoulli_(p)
+
+
+def run_profile(input, p, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Bernoulli(input, p, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, p, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Bernoulli(input, p, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for p in [0.5, torch.tensor(0.5)]:
-                input = torch.zeros(
-                    shape, dtype=torch.bfloat16, device=torch.device("xpu")
-                )
-
+                input = torch.zeros(shape, dtype=dtype, device=args.device)
                 # warm up
-                input.bernoulli_(0.5)
+                Bernoulli(input, p, args.device)
 
                 # go
                 print(
@@ -29,20 +57,30 @@
                     "; backward:",
                     backward,
                 )
-                with profile(
-                    activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                    record_shapes=True,
-                ) as prof:
-                    for i in range(num_iter):
-                        input.bernoulli_(p)
-                print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                # E2E time
-                torch.xpu.synchronize()
-                t1 = time.time()
-                for i in range(num_iter):
-                    input.bernoulli_(p)
-                torch.xpu.synchronize()
-                t2 = time.time()
-                e2e_time = (t2 - t1) / num_iter
-                print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(input, p, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, p, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.cauchy.py b/test/microbench/distribution.cauchy.py
index 259ac8ed88..ddea79890e 100644
--- a/test/microbench/distribution.cauchy.py
+++ b/test/microbench/distribution.cauchy.py
@@ -1,34 +1,76 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
-
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape, dtype=dtype, device=device)
-        # warm up
-        input.cauchy_()
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                input.cauchy_()
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
-        for i in range(num_iter):
-            input.cauchy_()
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def Cauchy(input, device):
+    input.cauchy_()
+
+
+def run_profile(input, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Cauchy(input, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Cauchy(input, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape, dtype=dtype, device=args.device)
+            # warm up
+            Cauchy(input, args.device)
+
+            # go
+            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
+            if not args.e2e_only:
+                run_profile(input, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(input, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/distribution.exponential.py b/test/microbench/distribution.exponential.py
index e3616e5114..c3d30cfca9 100644
--- a/test/microbench/distribution.exponential.py
+++ b/test/microbench/distribution.exponential.py
@@ -1,34 +1,76 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
 shape_list = [(8192, 8192)]
 backward = False
-num_iter = 20
-
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        input = torch.randn(shape, dtype=dtype, device=device)
-        # warm up
-        input.exponential_(0.5)
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:",
backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - input.exponential_(0.5) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Exponential(input, device): + input.exponential_(0.5) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - input.exponential_(0.5) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Exponential(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Exponential(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Exponential(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.geometric.py b/test/microbench/distribution.geometric.py index 095c7d1bae..8c8b9bfabd 100644 --- a/test/microbench/distribution.geometric.py +++ b/test/microbench/distribution.geometric.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,15 +6,44 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Geometric(input, device): + input.geometric_(0.5) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Geometric(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Geometric(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + 
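# average wall-clock seconds per iteration +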
print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=torch.bfloat16, device=args.device) # warm up - input.geometric_(0.5) + Geometric(input, args.device) # go print( @@ -26,20 +56,30 @@ "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.geometric_(0.5) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.geometric_(0.5) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.log_normal.py b/test/microbench/distribution.log_normal.py index 2b081e7cae..ca5c4bdac3 100644 --- a/test/microbench/distribution.log_normal.py +++ b/test/microbench/distribution.log_normal.py @@ -1,34 +1,76 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - # warm up - input.log_normal_(128, 128) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - input.log_normal_(128, 128) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Log_normal(input, device): + input.log_normal_(128, 128) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - input.log_normal_(128, 128) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Log_normal(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Log_normal(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else 
torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Log_normal(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.multinomial.py b/test/microbench/distribution.multinomial.py index 9ff254dcbc..b0601bb94c 100644 --- a/test/microbench/distribution.multinomial.py +++ b/test/microbench/distribution.multinomial.py @@ -1,51 +1,93 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for replacement in [False, True]: - for num_samples in [2, 128]: - input = torch.randn(shape, dtype=dtype, device=device).abs() - # warm up - input.multinomial(num_samples, replacement) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; replacement:", - replacement, - "; num_samples:", - num_samples, - "; backward:", - backward, - ) - with profile( - activities=[ - ProfilerActivity.CPU, - ProfilerActivity.XPU, - ], - record_shapes=True, - ) as prof: - for _ in range(num_iter): - input.multinomial(num_samples, replacement) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.multinomial(num_samples, replacement) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Multinomial(input, replacement, num_samples, device): + input.multinomial(num_samples, replacement) + + +def run_profile(input, replacement, num_samples, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Multinomial(input, replacement, num_samples, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, replacement, num_samples, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Multinomial(input, replacement, num_samples, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + 
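# divide total elapsed time by the iteration count +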
print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for replacement in [False, True]: + for num_samples in [2, 128]: + input = torch.randn(shape, dtype=dtype, device=args.device).abs() + # warm up + Multinomial(input, replacement, num_samples, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; replacement:", + replacement, + "; num_samples:", + num_samples, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, replacement, num_samples, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, replacement, num_samples, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.normal.py b/test/microbench/distribution.normal.py index f8067ebe8d..54112ff0eb 100644 --- a/test/microbench/distribution.normal.py +++ b/test/microbench/distribution.normal.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Normal(input, device): + input.normal_() + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Normal(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Normal(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.normal_() + Normal(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.normal_() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.normal_() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, 
args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/distribution.random.py b/test/microbench/distribution.random.py index b6a7d21831..df861a82de 100644 --- a/test/microbench/distribution.random.py +++ b/test/microbench/distribution.random.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Random(input, device): + input.random_(-(2**8), 2**8) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Random(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Random(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.random_(-(2**8), 2**8) + Random(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.random_(-(2**8), 2**8) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.random_(-(2**8), 2**8) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git 
a/test/microbench/distribution.uniform.py b/test/microbench/distribution.uniform.py index 14ad76f051..25390fb709 100644 --- a/test/microbench/distribution.uniform.py +++ b/test/microbench/distribution.uniform.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -5,32 +6,71 @@ shape_list = [(8192, 8192)] backward = False -num_iter = 20 -if __name__ == "__main__": + +def Uniform(input, device): + input.uniform_() + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Uniform(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Uniform(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=torch.bfloat16, device=torch.device("xpu")) - + input = torch.randn(shape, dtype=dtype, device=args.device) # warm up - input.uniform_() + Uniform(input, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - input.uniform_() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - input.uniform_() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/dropout.py b/test/microbench/dropout.py index c79dcd1a5e..eb9f06cdf1 100644 --- a/test/microbench/dropout.py +++ b/test/microbench/dropout.py @@ -1,27 +1,58 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192), (16, 1024)] -num_iter = 20 +backward = True -if __name__ == "__main__": - backward = True + +def Dropout(input, grad_dpcpp, dropout, backward, device): + output = dropout(input) + if backward: + output.backward(grad_dpcpp) + + +def run_profile(input, grad_dpcpp, dropout, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else 
ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Dropout(input, grad_dpcpp, dropout, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grad_dpcpp, dropout, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Dropout(input, grad_dpcpp, dropout, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: H, W = (shape[0], shape[1]) - input = torch.randn((H, W)).to(dtype=dtype, device="xpu") + input = torch.randn((H, W)).to(dtype=dtype, device=args.device) dropout = torch.nn.Dropout(p=0.5) - dropout.to(device="xpu", dtype=dtype) - grad_dpcpp = torch.randn((H, W)).to(device="xpu", dtype=dtype) - input.requires_grad_(True) - - # warm up - output = dropout(input) + dropout.to(device=args.device, dtype=dtype) if backward: - output.backward(grad_dpcpp) + grad_dpcpp = torch.randn((H, W)).to(device=args.device, dtype=dtype) + input.requires_grad_(True) + # warm up + Dropout(input, grad_dpcpp, dropout, backward, args.device) # go print( @@ -34,24 +65,34 @@ "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = dropout(input) - if backward: - output.backward(grad_dpcpp) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = dropout(input) - if backward: - output.backward(grad_dpcpp) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + input, grad_dpcpp, dropout, backward, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, grad_dpcpp, dropout, backward, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/eltwise.add.py b/test/microbench/eltwise.add.py index e07ea315eb..6b0087e884 100644 --- a/test/microbench/eltwise.add.py +++ b/test/microbench/eltwise.add.py @@ -1,56 +1,95 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 - shape_list = [ ((8192, 8192), (8192, 8192)), # contiguous input ((100000, 10000), (100000, 10000)), # non-contiguous input ((8190, 8190), (8190, 8190)), # non-vectorized input ((8192, 8192), (0.5)), # scalar input ] +backward = False + -for shape in shape_list: - for 
dtype in [torch.bfloat16, torch.float16, torch.float32]: - a = torch.randn(shape[0], dtype=dtype, device=device) - if shape[1] == 0.5: - b = int(shape[1]) - else: - b = torch.randn(shape[1], dtype=dtype, device=device) - if shape[0] == 100000: - a = torch.as_strided(a, (8192, 8192), (20000, 2)) - b = torch.as_strided(b, (8192, 8192), (20000, 2)) - - # warm up - for i in range(10): - output = a + b - - # go - print( - "shape:", - (shape[0], shape[1]), - "; datatype:", - dtype, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = a + b - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +def Add(a, b, device): + output = a + b + + +def run_profile(a, b, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = a + b - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Add(a, b, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, b, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Add(a, b, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + a = torch.randn(shape[0], dtype=dtype, device=args.device) + if shape[1] == 0.5: + b = int(shape[1]) + else: + b = torch.randn(shape[1], dtype=dtype, device=args.device) + if shape[0] == 100000: + a = torch.as_strided(a, (8192, 8192), (20000, 2)) + b = torch.as_strided(b, (8192, 8192), (20000, 2)) + # warm up + Add(a, b, args.device) + + # go + print( + "shape:", + (shape[0], shape[1]), + "; datatype:", + dtype, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(a, b, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(a, b, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/embedding.py b/test/microbench/embedding.py index 9ee81ad974..a7723a9a72 100644 --- a/test/microbench/embedding.py +++ b/test/microbench/embedding.py @@ -1,43 +1,86 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 8)] -device = "xpu" backward = True dict_len = 2500000 vect_len = 128 num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, 
torch.float16, torch.float32]: - emb = torch.nn.Embedding(dict_len, vect_len, dtype=dtype, device=device) - input = torch.randint(0, dict_len, (1024, 8), device=device) - grad = torch.randn(1024, 8, vect_len, dtype=dtype, device=device) - - # warm up - output = emb(input) - output.backward(grad) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = emb(input) - output.backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def Embedding(input, grad, emb, device): + output = emb(input) + output.backward(grad) + + +def run_profile(input, grad, emb, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = emb(input) - output.backward(grad) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Embedding(input, grad, emb, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grad, emb, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Embedding(input, grad, emb, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + emb = torch.nn.Embedding( + dict_len, vect_len, dtype=dtype, device=args.device + ) + input = torch.randint(0, dict_len, (1024, 8), device=args.device) + grad = torch.randn(1024, 8, vect_len, dtype=dtype, device=args.device) + # warm up + Embedding(input, grad, emb, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, grad, emb, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, grad, emb, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/embedding_bag.py b/test/microbench/embedding_bag.py index 336c4f3067..f121a75fae 100644 --- a/test/microbench/embedding_bag.py +++ b/test/microbench/embedding_bag.py @@ -1,68 +1,102 @@ +import argparse import random import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +dict_len = 2500000 +vect_len = 128 +batch = 1024 backward = True -num_iter = 20 - -for dtype in [torch.bfloat16, torch.float16,
torch.float32]: - for reduce in ["max", "mean", "sum"]: - dict_len = 2500000 - vect_len = 128 - batch = 1024 - - emb = torch.nn.EmbeddingBag( - dict_len, vect_len, mode=reduce, dtype=dtype, device=device - ) - input = torch.empty([batch], dtype=torch.long, device=device) - for i in range(batch): - input[i] = random.randint(0, dict_len - 1) - - bag = torch.empty([batch], dtype=torch.long, device=device) - for i in range(batch): - bag[i] = i - - if backward: - grad = torch.randn(batch, vect_len, dtype=dtype, device=device) - - # warm up - for i in range(5): - output = emb(input, bag) - if backward: - output.backward(grad) - - # go - print( - "shape:", - (batch), - "; datatype:", - dtype, - "; reduce:", - reduce, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(20): - output = emb(input, bag) - if backward: - output.backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100)) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Embedding_bag(input, bag, grad, emb, backward, device): + output = emb(input, bag) + if backward: + output.backward(grad) + + +def run_profile(input, bag, grad, emb, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = emb(input, bag) + Embedding_bag(input, bag, grad, emb, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, bag, grad, emb, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Embedding_bag(input, bag, grad, emb, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["max", "mean", "sum"]: + input = torch.empty([batch], dtype=torch.long, device=args.device) + emb = torch.nn.EmbeddingBag( + dict_len, vect_len, mode=reduce, dtype=dtype, device=args.device + ) + for i in range(batch): + input[i] = random.randint(0, dict_len - 1) + + bag = torch.empty([batch], dtype=torch.long, device=args.device) + for i in range(batch): + bag[i] = i + if backward: - output.backward(grad) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + grad = torch.randn(batch, vect_len, dtype=dtype, device=args.device) + # warm up + Embedding_bag(input, bag, grad, emb, backward, args.device) + + # go + print( + "shape:", + (batch), + "; datatype:", + dtype, + "; reduce:", + reduce, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, bag, grad, emb, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, bag, grad, emb, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", 
action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/flip.py b/test/microbench/flip.py index 3dd13ad73c..2914731ae9 100644 --- a/test/microbench/flip.py +++ b/test/microbench/flip.py @@ -1,10 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 shape_list = [ ((64, 1024, 1024), (0, 1)), ((1024, 64, 1024), (0, 2)), @@ -13,52 +12,86 @@ ((16, 128, 512, 512), (0, 3)), ((16, 128, 512, 512), (1, 3)), ] - backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - - if backward: - input.requires_grad_(True) - - # warm up - output = torch.flip(input, shape[1]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.flip(input, shape[1]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Flip(input, shape, backward, device): + output = torch.flip(input, shape[1]) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(input, shape, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.flip(input, shape[1]) + Flip(input, shape, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, shape, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Flip(input, shape, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + # warm up + Flip(input, shape, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, shape, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, shape, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + 
default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/grid_sampler.grid_sampler_2d.py b/test/microbench/grid_sampler.grid_sampler_2d.py index 2dc3e7332d..c8ba070e5e 100644 --- a/test/microbench/grid_sampler.grid_sampler_2d.py +++ b/test/microbench/grid_sampler.grid_sampler_2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,81 +9,133 @@ (4, 32, 128, 128), (16, 128, 512, 512), ] - -device = "xpu" backward = True -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["bilinear", "nearest", "bicubic"]: - for padding_mode in ["zeros", "border", "reflection"]: - for align_corners in [True, False]: - N, C, H, W = shape - - input = torch.randn(N, C, H, W, dtype=dtype, device=device) - grid = torch.randn(N, H, W, 2, dtype=dtype, device=device) - - if backward: - input.requires_grad_(True) - grid.requires_grad_(True) - - # warm up - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; padding_mode:", - padding_mode, - "; align_corners:", - align_corners, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.functional.grid_sample( + + +def Grad_sample2d(input, grid, backward, mode, padding_mode, align_corners, device): + output = torch.nn.functional.grid_sample( + input, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners, + ) + if backward: + output.sum().backward() + + +def run_profile( + input, grid, backward, mode, padding_mode, align_corners, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Grad_sample2d( + input, grid, backward, mode, padding_mode, align_corners, device + ) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grid, backward, mode, padding_mode, align_corners, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Grad_sample2d(input, grid, backward, mode, padding_mode, align_corners, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def 
benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for mode in ["bilinear", "nearest", "bicubic"]: + for padding_mode in ["zeros", "border", "reflection"]: + for align_corners in [True, False]: + N, C, H, W = shape + input = torch.randn(N, C, H, W, dtype=dtype, device=args.device) + grid = torch.randn(N, H, W, 2, dtype=dtype, device=args.device) + + if backward: + input.requires_grad_(True) + grid.requires_grad_(True) + + # warm up + Grad_sample2d( input, grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, + backward, + mode, + padding_mode, + align_corners, + args.device, ) - if backward: - output.sum().backward() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; mode:", + mode, + "; padding_mode:", + padding_mode, + "; align_corners:", + align_corners, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/grid_sampler.grid_sampler_3d.py b/test/microbench/grid_sampler.grid_sampler_3d.py index 41a296fb90..b7841b8847 100644 --- a/test/microbench/grid_sampler.grid_sampler_3d.py +++ b/test/microbench/grid_sampler.grid_sampler_3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -7,80 +8,136 @@ (2, 5, 6, 3, 5), (8, 16, 64, 64, 64), ] - -device = "xpu" backward = True -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["bilinear", "nearest"]: - for padding_mode in ["zeros", "border", "reflection"]: - for align_corners in [True, False]: - N, C, D, H, W = shape - input = torch.randn(N, C, D, H, W, dtype=dtype, device=device) - grid = torch.randn(N, D, H, W, 3, dtype=dtype, device=device) - - if backward: - input.requires_grad_(True) - grid.requires_grad_(True) - - # warm up - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; padding_mode:", - padding_mode, - "; align_corners:", - align_corners, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.grid_sample( - input, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, - ) - if backward: - output.sum().backward() - 
print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.functional.grid_sample( + + +def Grad_sample3d(input, grid, backward, mode, padding_mode, align_corners, device): + output = torch.nn.functional.grid_sample( + input, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners, + ) + if backward: + output.sum().backward() + + +def run_profile( + input, grid, backward, mode, padding_mode, align_corners, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Grad_sample3d( + input, grid, backward, mode, padding_mode, align_corners, device + ) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, grid, backward, mode, padding_mode, align_corners, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Grad_sample3d(input, grid, backward, mode, padding_mode, align_corners, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for mode in ["bilinear", "nearest"]: + for padding_mode in ["zeros", "border", "reflection"]: + for align_corners in [True, False]: + N, C, D, H, W = shape + input = torch.randn( + N, C, D, H, W, dtype=dtype, device=args.device + ) + grid = torch.randn( + N, D, H, W, 3, dtype=dtype, device=args.device + ) + + if backward: + input.requires_grad_(True) + grid.requires_grad_(True) + # warm up + Grad_sample3d( input, grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners, + backward, + mode, + padding_mode, + align_corners, + args.device, ) - if backward: - output.sum().backward() - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; mode:", + mode, + "; padding_mode:", + padding_mode, + "; align_corners:", + align_corners, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + grid, + backward, + mode, + padding_mode, + align_corners, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/group_norm.py b/test/microbench/group_norm.py index ef02aaa749..74efcfd4c4 100644 --- 
a/test/microbench/group_norm.py +++ b/test/microbench/group_norm.py @@ -1,11 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [ (1, 32, 128, 32, 32), # all channel for 1 group (16, 1024, 128, 32, 32), # normal shape, big memory @@ -13,79 +11,113 @@ (32, 32, 512, 256, 256), # group_num=32, channel for per group=16,big memory (8, 32, 32, 16, 64, 64), # 3d ] +backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for channels_last in [False, True]: - for affine in [False, True]: - num_groups = shape[0] - shape_input = (shape[1], shape[2], shape[3], shape[4]) - C = shape[2] - memory_format = ( - torch.channels_last_3d - if len(shape_input) == 5 - else torch.channels_last - ) - - if channels_last: - input = ( - torch.randn(shape_input) - .to(memory_format=memory_format) - .to(device=device, dtype=dtype) - ) - else: - input = torch.randn(shape_input).to(device=device, dtype=dtype) +def Group_norm(input, m, backward, device): + output = m(input) + if backward: + grad_out = torch.randn_like(output).to(device) + (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - if backward: - input.requires_grad_(True) - m = torch.nn.GroupNorm(num_groups, C, affine=affine, dtype=dtype).to( - device - ) +def run_profile(input, m, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Group_norm(input, m, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # warm up - for i in range(5): - output = m(input) + +def run_e2e(input, m, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Group_norm(input, m, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for channels_last in [False, True]: + for affine in [False, True]: + num_groups = shape[0] + shape_input = (shape[1], shape[2], shape[3], shape[4]) + C = shape[2] + memory_format = ( + torch.channels_last_3d + if len(shape_input) == 5 + else torch.channels_last + ) + + if channels_last: + input = ( + torch.randn(shape_input) + .to(memory_format=memory_format) + .to(device=args.device, dtype=dtype) + ) + else: + input = torch.randn(shape_input).to( + device=args.device, dtype=dtype + ) if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - - # go - print( - "shape:", - (shape_input), - "; datatype:", - dtype, - "; channels_last:", - channels_last, - "; affine:", - affine, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = m(input) - - if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = 
time.time() - for i in range(num_iter): - output = m(input) - if backward: - grad_out = torch.randn_like(output).to(device) - (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + m = torch.nn.GroupNorm( + num_groups, C, affine=affine, dtype=dtype + ).to(args.device) + # warm up + Group_norm(input, m, backward, args.device) + + # go + print( + "shape:", + (shape[1], shape[2], shape[3], shape[4]), + "; datatype:", + dtype, + "; channels_last:", + channels_last, + "; affine:", + affine, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, m, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, m, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/im2col.py b/test/microbench/im2col.py index 5af56d285e..e1b9a6b43c 100644 --- a/test/microbench/im2col.py +++ b/test/microbench/im2col.py @@ -1,52 +1,84 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" - shape_list = [(1, 3, 1200, 1200), (1, 3, 224, 224), (1, 3, 63, 1200), (1, 3, 1200, 63)] kernel_size = (7, 7) dilation = (6, 6) -num_iter = 20 backward = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device, requires_grad=backward) - - # warmup - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 - ) - if backward: - torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 - ) - if backward: - torch.autograd.grad( - output, input, grad_outputs=torch.ones_like(output) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def Im2col(input, kernel_size, backward, device): + output = torch.nn.functional.unfold( + input, kernel_size, dilation=dilation, padding=1, stride=1 + ) + if backward: + torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) + + +def run_profile(input, kernel_size, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.nn.functional.unfold( - input, kernel_size, dilation=dilation, padding=1, stride=1 + Im2col(input, kernel_size, backward, device) + 
print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, kernel_size, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Im2col(input, kernel_size, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn( + shape, dtype=dtype, device=args.device, requires_grad=backward ) - if backward: - torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output)) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + # warmup + Im2col(input, kernel_size, backward, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, kernel_size, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, kernel_size, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.diag.py b/test/microbench/indexing.diag.py index 45ff879c96..6ba9ab625e 100644 --- a/test/microbench/indexing.diag.py +++ b/test/microbench/indexing.diag.py @@ -1,41 +1,80 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192), (8192, 8192)] -device = "xpu" backward = False -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - output = torch.diag(input) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = torch.diag(input) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Diag(input, device): + output = torch.diag(input) + + +def run_profile(input, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - output = torch.diag(input) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + 
Diag(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Diag(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Diag(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, cache_r, cache_w, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index.py b/test/microbench/indexing.index.py index 5827cabfb8..86305f4a55 100644 --- a/test/microbench/indexing.index.py +++ b/test/microbench/indexing.index.py @@ -1,88 +1,107 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(4, 15000)] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["with_nonzero", "without_nonzero"]: - d = torch.rand(shape, dtype=dtype, device=device) - e = torch.rand(shape, dtype=dtype, device=device) - if mode == "with_nonzero": - # warm up - for i in range(100): - f = d < e - g = e[f] +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + parser.add_argument("--num_iter", type=int, default=20, help="Number of iterations") + parser.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + parser.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - f = d < e - g = e[f] - print(prof.key_averages().table(sort_by="xpu_time_total")) + args = parser.parse_args() + return args - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): + +def benchmark(shape, dtype, mode, device, num_iter, do_profile, do_e2e): + d 
= torch.rand(shape, dtype=dtype, device=device) +    e = torch.rand(shape, dtype=dtype, device=device) +    f = d < e +    g = e[f] + +    # warm up +    if mode == "with_nonzero": +        for i in range(100): +            f = d < e +            g = e[f] +    else: +        f = torch.linspace(0, 4 - 2, steps=int(4 / 2), device=device).to(torch.long) +        for i in range(100): +            g = e[f] + +    # go +    print( +        "shape:", +        (shape), +        "; datatype:", +        dtype, +        "; mode:", +        mode, +        "; backward:", +        backward, +    ) +    if not do_e2e: +        with profile( +            activities=[ +                ProfilerActivity.CPU, +                ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +            ], +            record_shapes=True, +        ) as prof: +            for i in range(num_iter): +                if mode == "with_nonzero": f = d < e g = e[f] -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") -        else: -            f = torch.linspace(0, 4 - 2, steps=int(4 / 2), device=device).to( -                torch.long -            ) -            # warm up -            for i in range(100): +                else: g = e[f] +        print(prof.key_averages().table(sort_by=f"{device}_time_total")) + +    # E2E time +    if not do_profile: +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t1 = time.time() +        for i in range(num_iter): +            if mode == "with_nonzero": +                f = d < e +                g = e[f] +            else: +                g = e[f] +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t2 = time.time() +        e2e_time = (t2 - t1) / num_iter +        print("E2E total time:", f"{float(e2e_time):.20f}") -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; mode:", -                mode, -                "; backward:", -                backward, + +def main(): +    args = parse_args() +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for mode in ["with_nonzero", "without_nonzero"]: +                benchmark( +                    shape=shape, +                    dtype=dtype, +                    mode=mode, +                    device=args.device, +                    num_iter=args.num_iter, +                    do_profile=args.profile_only, +                    do_e2e=args.e2e_only, ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    g = e[f] -            print(prof.key_averages().table(sort_by="xpu_time_total")) -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                g = e[f] -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + +if __name__ == "__main__": +    main() diff --git a/test/microbench/indexing.index_add.py b/test/microbench/indexing.index_add.py index efc0ee9a07..7438f5543c 100644 --- a/test/microbench/indexing.index_add.py +++ b/test/microbench/indexing.index_add.py @@ -1,61 +1,120 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 step = int(1024 / 2) -cache_r = torch.randn((8192 * 8192), device=device) -cache_w = torch.randn((8192 * 8192), device=device) - -for shape in shape_list: -    for dtype in [torch.bfloat16, torch.float16, torch.float32]: -        for dim in [0, 1]: -            input = torch.zeros(shape, dtype=dtype, device=device) -            indices = torch.linspace(0, 1022, steps=step, device=device).to(torch.long) -            y_0 = torch.ones((512, 1024), dtype=dtype, device=device) -            y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - -            # warm up -            for i in range(10): -                output = input.index_add(0, indices, y_0) - -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; dim:", -                dim, -                "; backward:", - 
backward, -            ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    cache_r = cache_w * i -                    if dim == 0: -                        output = input.index_add(dim, indices, y_0) -                    else: -                        output = input.index_add(dim, indices, y_1) -            print(prof.key_averages().table(sort_by="xpu_time_total")) - -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                cache_r = cache_w * i -                if dim == 0: -                    output = input.index_add(dim, indices, y_0) -                else: -                    output = input.index_add(dim, indices, y_1) -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_add(input, indices, y_0, y_1, dim, device): +    if dim == 0: +        output = input.index_add(dim, indices, y_0) +    else: +        output = input.index_add(dim, indices, y_1) + + +def run_profile(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): +    with profile( +        activities=[ +            ProfilerActivity.CPU, +            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +        ], +        record_shapes=True, +    ) as prof: +        for i in range(num_iter): +            cache_r = cache_w * i +            Index_add(input, indices, y_0, y_1, dim, device) +    print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t1 = time.time() +    for i in range(num_iter): +        cache_r = cache_w * i +        Index_add(input, indices, y_0, y_1, dim, device) +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t2 = time.time() +    e2e_time = (t2 - t1) / num_iter +    print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for dim in [0, 1]: +                input = torch.zeros(shape, dtype=dtype, device=args.device) +                indices = torch.linspace(0, 1022, steps=step, device=args.device).to( +                    torch.long +                ) +                y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) +                y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) +                cache_r = torch.randn((8192 * 8192), device=args.device) +                cache_w = torch.randn((8192 * 8192), device=args.device) +                # warm up +                Index_add(input, indices, y_0, y_1, dim, args.device) + +                # go +                print( +                    "shape:", +                    (shape), +                    "; datatype:", +                    dtype, +                    "; dim:", +                    dim, +                    "; backward:", +                    backward, +                ) +                if not args.e2e_only: +                    run_profile( +                        input, +                        indices, +                        y_0, +                        y_1, +                        dim, +                        cache_r, +                        cache_w, +                        args.device, +                        args.num_iter, +                    ) + +                if not args.profile_only: +                    run_e2e( +                        input, +                        indices, +                        y_0, +                        y_1, +                        dim, +                        cache_r, +                        cache_w, +                        args.device, +                        args.num_iter, +                    ) + + +def parse_args(): +    parser = argparse.ArgumentParser(description="OP Benchmark") +    parser.add_argument( +        "--device", +        type=str, +        default="xpu", +        help='Device to run on (e.g., "cpu", "cuda", "xpu")', +    ) +    group = parser.add_mutually_exclusive_group() +    group.add_argument( +        "--profile-only", action="store_true", help="Only Run profile timing" +    ) +    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") +    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") +    return parser.parse_args() + + +if __name__ == "__main__": +    args = parse_args() +    benchmark(args)
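
The cache_r = cache_w * i statement inside the timed loops of these indexing benchmarks is a cache flush: between iterations it streams a large tensor (8192 * 8192 float32 elements here, 256 MB; other scripts in this patch use 1024 ** 3) through memory so index_add never re-reads data left in cache by the previous iteration. A minimal sketch of the same pattern as a reusable helper, assuming only that streaming a buffer larger than the last-level cache evicts the working set; the name make_cache_flusher is illustrative, not part of the patch:

    import torch

    def make_cache_flusher(device: str, n_elems: int = 8192 * 8192):
        # Allocate a buffer larger than the last-level cache; rewriting it
        # between timed iterations evicts the benchmark tensors from cache.
        cache_w = torch.randn(n_elems, device=device)

        def flush(i: int = 1) -> None:
            # Same read-modify-write stream as `cache_r = cache_w * i`.
            _ = cache_w * i

        return flush

diff --git a/test/microbench/indexing.index_copy.py 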
b/test/microbench/indexing.index_copy.py index b06984e321..05b17cc816 100644 --- a/test/microbench/indexing.index_copy.py +++ b/test/microbench/indexing.index_copy.py @@ -1,60 +1,119 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 1022, steps=512, device=device).to(torch.long) - y_0 = torch.ones((512, 1024), dtype=dtype, device=device) - y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - - # warm up - for i in range(10): - output = input.index_copy(0, indices, y_0) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_copy(dim, indices, y_0) - else: - output = input.index_copy(dim, indices, y_1) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_copy(dim, indices, y_0) - else: - output = input.index_copy(dim, indices, y_1) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_copy(input, indices, y_0, y_1, dim, device): + if dim == 0: + output = input.index_copy(dim, indices, y_0) + else: + output = input.index_copy(dim, indices, y_1) + + +def run_profile(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Index_copy(input, indices, y_0, y_1, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, y_0, y_1, dim, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Index_copy(input, indices, y_0, y_1, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace(0, 1022, steps=512, device=args.device).to( + torch.long + ) + y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) + y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Index_copy(input, indices, y_0, y_1, dim, args.device) + + # go + print( + "shape:", + (shape), + "; 
datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + indices, + y_0, + y_1, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + y_0, + y_1, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index_fill.py b/test/microbench/indexing.index_fill.py index 3dd6309692..fbda4276c4 100644 --- a/test/microbench/indexing.index_fill.py +++ b/test/microbench/indexing.index_fill.py @@ -1,60 +1,114 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 1022, steps=512, device=device).to(torch.long) - y_0 = torch.ones((512, 1024), dtype=dtype, device=device) - y_1 = torch.randn((1024, 512), dtype=dtype, device=device) - - # warm up - for i in range(10): - output = input.index_fill(0, indices, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_fill(dim, indices, 1) - else: - output = input.index_fill(dim, indices, 2) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - if dim == 0: - output = input.index_fill(dim, indices, 1) - else: - output = input.index_fill(dim, indices, 2) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + + +def Index_fill(input, indices, dim, device): + if dim == 0: + output = input.index_fill(dim, indices, 1) + else: + output = input.index_fill(dim, indices, 2) + + +def run_profile(input, indices, dim, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Index_fill(input, indices, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, dim, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in 
range(num_iter): + cache_r = cache_w * i + Index_fill(input, indices, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace(0, 1022, steps=512, device=args.device).to( + torch.long + ) + y_0 = torch.ones((512, 1024), dtype=dtype, device=args.device) + y_1 = torch.randn((1024, 512), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Index_fill(input, indices, dim, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, + indices, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + dim, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.index_put.py b/test/microbench/indexing.index_put.py index b0ea89e954..8306b3b0a4 100644 --- a/test/microbench/indexing.index_put.py +++ b/test/microbench/indexing.index_put.py @@ -1,88 +1,107 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(4, 15000)] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for mode in ["with_nonzero", "without_nonzero"]: - d = torch.rand(4, 15000, dtype=dtype, device=device) - e = torch.rand(4, 15000, dtype=dtype, device=device) - f = d < e - g = e[f] - if mode == "with_nonzero": - # warm up - for i in range(100): - d[f] = g +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + parser.add_argument("--num_iter", type=int, default=20, help="Number of iterations") + parser.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + parser.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + + args = parser.parse_args() + return args - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; mode:", - mode, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - d[f] = g - print(prof.key_averages().table(sort_by="xpu_time_total")) - # E2E time - torch.xpu.synchronize() - t1 = 
time.time() -            for i in range(num_iter): +def benchmark(shape, dtype, mode, device, num_iter, do_profile, do_e2e): +    d = torch.rand(4, 15000, dtype=dtype, device=device) +    e = torch.rand(4, 15000, dtype=dtype, device=device) +    f = d < e +    g = e[f] + +    # warm up +    if mode == "with_nonzero": +        for i in range(100): +            d[f] = g +    else: +        f = f.nonzero() +        index = [] +        for i in range(f.dim()): +            index.append(f.select(1, i)) +        for i in range(100): +            d[index] = g + +    # go +    print( +        "shape:", +        (shape), +        "; datatype:", +        dtype, +        "; mode:", +        mode, +        "; backward:", +        backward, +    ) +    if not do_e2e: +        with profile( +            activities=[ +                ProfilerActivity.CPU, +                ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +            ], +            record_shapes=True, +        ) as prof: +            for i in range(num_iter): +                if mode == "with_nonzero": d[f] = g -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") -        else: -            f = f.nonzero() -            index = [] -            for i in range(f.dim()): -                index.append(f.select(1, i)) -            # warm up -            for i in range(100): +                else: d[index] = g +        print(prof.key_averages().table(sort_by=f"{device}_time_total")) -            # go -            print( -                "shape:", -                (shape), -                "; datatype:", -                dtype, -                "; mode:", -                mode, -                "; backward:", -                backward, +    # E2E time +    if not do_profile: +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t1 = time.time() +        for i in range(num_iter): +            if mode == "with_nonzero": +                d[f] = g +            else: +                d[index] = g +        if device in ["xpu", "cuda"]: +            torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +        t2 = time.time() +        e2e_time = (t2 - t1) / num_iter +        print("E2E total time:", f"{float(e2e_time):.20f}") + + +def main(): +    args = parse_args() +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            for mode in ["with_nonzero", "without_nonzero"]: +                benchmark( +                    shape=shape, +                    dtype=dtype, +                    mode=mode, +                    device=args.device, +                    num_iter=args.num_iter, +                    do_profile=args.profile_only, +                    do_e2e=args.e2e_only, ) -            with profile( -                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -                record_shapes=True, -            ) as prof: -                for i in range(num_iter): -                    d[index] = g -            print(prof.key_averages().table(sort_by="xpu_time_total")) -            # E2E time -            torch.xpu.synchronize() -            t1 = time.time() -            for i in range(num_iter): -                d[index] = g -            torch.xpu.synchronize() -            t2 = time.time() -            e2e_time = (t2 - t1) / num_iter -            print("E2E total time:", f"{float(e2e_time):.20f}") + +if __name__ == "__main__": +    main() diff --git a/test/microbench/indexing.index_select.py b/test/microbench/indexing.index_select.py index bb22752429..a953b34d98 100644 --- a/test/microbench/indexing.index_select.py +++ b/test/microbench/indexing.index_select.py @@ -1,46 +1,87 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(1024, 1024), (8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: -    for dtype in [torch.bfloat16, torch.float16, torch.float32]: -        main_size = shape[0] -        step = int(main_size / 2) -        input = torch.randn(shape, dtype=dtype, device=device) -        indices = torch.linspace(0, shape[0] - 2, steps=step, device=device).to( -            torch.long -        ) - -        # warm up -        for i in range(10): -            y_0 = torch.index_select(input, 0, indices) - -        # go -        print("shape:", (shape), "; 
datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - y_0 = torch.index_select(input, 0, indices) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Index_select(input, indices, device): + y_0 = torch.index_select(input, 0, indices) + + +def run_profile(input, indices, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - y_0 = torch.index_select(input, 0, indices) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Index_select(input, indices, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Index_select(input, indices, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + main_size = shape[0] + step = int(main_size / 2) + input = torch.randn(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, shape[0] - 2, steps=step, device=args.device + ).to(torch.long) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + # warm up + Index_select(input, indices, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, indices, cache_r, cache_w, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, indices, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.masked_fill.py b/test/microbench/indexing.masked_fill.py index 9d3eb4864a..bd62c29099 100644 --- a/test/microbench/indexing.masked_fill.py +++ b/test/microbench/indexing.masked_fill.py @@ -1,45 +1,87 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in 
[torch.bfloat16, torch.float16, torch.float32]: -        input = torch.zeros(shape, dtype=dtype, device=device) -        masks_ = torch.zeros((8192), dtype=dtype, device=device) -        indices = torch.linspace(0, 8190, steps=4096, device=device).to(torch.long) -        masks_.index_fill_(0, indices, True) -        masks = masks_.to(torch.bool) - -        # warm up -        for i in range(10): -            y_1 = input.masked_fill(mask=masks, value=1) - -        # go -        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) -        with profile( -            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], -            record_shapes=True, -        ) as prof: -            for i in range(num_iter): -                cache_r = cache_w * i -                y_1 = input.masked_fill(mask=masks, value=1) -        print(prof.key_averages().table(sort_by="xpu_time_total")) - -        # E2E time -        torch.xpu.synchronize() -        t1 = time.time() + + +def Masked_fill(input, masks, device): +    y_1 = input.masked_fill(mask=masks, value=1) + + +def run_profile(input, masks, cache_r, cache_w, device, num_iter): +    with profile( +        activities=[ +            ProfilerActivity.CPU, +            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, +        ], +        record_shapes=True, +    ) as prof: for i in range(num_iter): cache_r = cache_w * i -            y_1 = input.masked_fill(mask=masks, value=1) -    torch.xpu.synchronize() -    t2 = time.time() -    e2e_time = (t2 - t1) / num_iter -    print("E2E total time:", f"{float(e2e_time):.20f}") +            Masked_fill(input, masks, device) +    print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, masks, cache_r, cache_w, device, num_iter): +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t1 = time.time() +    for i in range(num_iter): +        cache_r = cache_w * i +        Masked_fill(input, masks, device) +    if device in ["xpu", "cuda"]: +        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() +    t2 = time.time() +    e2e_time = (t2 - t1) / num_iter +    print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): +    for shape in shape_list: +        for dtype in [torch.bfloat16, torch.float16, torch.float32]: +            input = torch.zeros(shape, dtype=dtype, device=args.device) +            masks_ = torch.zeros((8192), dtype=dtype, device=args.device) +            indices = torch.linspace(0, 8190, steps=4096, device=args.device).to( +                torch.long +            ) +            masks_.index_fill_(0, indices, True) +            masks = masks_.to(torch.bool) +            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) +            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + +            # warm up +            Masked_fill(input, masks, args.device) + +            # go +            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) +            if not args.e2e_only: +                run_profile(input, masks, cache_r, cache_w, args.device, args.num_iter) + +            if not args.profile_only: +                run_e2e(input, masks, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): +    parser = argparse.ArgumentParser(description="OP Benchmark") +    parser.add_argument( +        "--device", +        type=str, +        default="xpu", +        help='Device to run on (e.g., "cpu", "cuda", "xpu")', +    ) +    group = parser.add_mutually_exclusive_group() +    group.add_argument( +        "--profile-only", action="store_true", help="Only Run profile timing" +    ) +    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") +    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") +    return parser.parse_args() + + +if __name__ == "__main__": +    args = parse_args() +    benchmark(args)
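
Every script in this patch builds its profiler arguments the same way: ProfilerActivity.CPU plus the accelerator activity matching --device, and the sort key f"{device}_time_total" for prof.key_averages().table(). A minimal sketch of that selection gathered in one place; profiler_config is an illustrative name, and the cpu_time_total fallback for non-accelerator devices is an assumption, not something the patch does:

    from torch.profiler import ProfilerActivity

    def profiler_config(device: str):
        # CPU activity is always recorded; add the matching accelerator.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        # Sort key understood by prof.key_averages().table(sort_by=...).
        if device in ("xpu", "cuda"):
            sort_key = f"{device}_time_total"
        else:
            sort_key = "cpu_time_total"  # assumption: CPU-only fallback
        return activities, sort_key

diff --git a/test/microbench/indexing.put.py b/test/microbench/indexing.put.py index 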
cb19375890..d24765ff71 100644 --- a/test/microbench/indexing.put.py +++ b/test/microbench/indexing.put.py @@ -1,45 +1,101 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.zeros(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 8190 * 8190, steps=4096 * 4096, device=device).to( - torch.long - ) - sources = torch.ones((4096, 4096), dtype=dtype, device=device) - - # warm up - for i in range(10): - input.put_(index=indices, source=sources) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - input.put_(index=indices, source=sources) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Put(input, indices, sources, device): + input.put_(index=indices, source=sources) + + +def run_profile(input, indices, sources, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - input.put_(index=indices, source=sources) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Put(input, indices, sources, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, sources, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Put(input, indices, sources, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.zeros(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, 8190 * 8190, steps=4096 * 4096, device=args.device + ).to(torch.long) + sources = torch.ones((4096, 4096), dtype=dtype, device=args.device) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Put(input, indices, sources, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, + indices, + sources, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + input, + indices, + sources, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + 
group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/indexing.take.py b/test/microbench/indexing.take.py index 13163631a4..4d6fca2123 100644 --- a/test/microbench/indexing.take.py +++ b/test/microbench/indexing.take.py @@ -1,44 +1,86 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8192, 8192)] -device = "xpu" backward = False -num_iter = 20 -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - indices = torch.linspace(0, 8190 * 8190, steps=4096 * 4096, device=device).to( - torch.long - ) - - # warm up - for i in range(10): - output = torch.take(input, indices) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = torch.take(input, indices) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Take(input, indices, device): + output = torch.take(input, indices) + + +def run_profile(input, indices, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w * i - output = torch.take(input, indices) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Take(input, indices, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, indices, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Take(input, indices, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + indices = torch.linspace( + 0, 8190 * 8190, steps=4096 * 4096, device=args.device + ).to(torch.long) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Take(input, indices, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + input, indices, cache_r, cache_w, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, indices, cache_r, cache_w, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP 
Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/layer_norm.py b/test/microbench/layer_norm.py index 67a3ef502f..909185ff25 100644 --- a/test/microbench/layer_norm.py +++ b/test/microbench/layer_norm.py @@ -1,66 +1,97 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 - shape_list = [ ((1, 1024), (1024)), ((2, 4096, 320), (4096, 320)), ((512, 3136, 128), (3136, 128)), ((128, 49, 196, 1024), (49, 196, 1024)), ] +backward = True + +def Layer_norm(input, m, backward, device): + output = m(input) + if backward: + gy = torch.empty_like(output) + output.backward(gy) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - - if backward: - input.requires_grad_(True) - - # warm up - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def run_profile(input, m, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) - output = m(input) + Layer_norm(input, m, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, m, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Layer_norm(input, m, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + m = torch.nn.LayerNorm(shape[1], device=args.device, dtype=dtype) + # warm up + Layer_norm(input, m, backward, args.device) + + # go + print( + 
"shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, m, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, m, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.binary_cross_entropy.py b/test/microbench/loss.binary_cross_entropy.py index e6e3a942ac..f30217906a 100644 --- a/test/microbench/loss.binary_cross_entropy.py +++ b/test/microbench/loss.binary_cross_entropy.py @@ -1,16 +1,12 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [(8733, 8733), (8733, 513), (513, 8733), (8192, 8192)] - -cache_r = torch.randn(1024 * 1024 * 1024, device=device) -cache_w = torch.randn(1024 * 1024 * 1024, device=device) +backward = True def _do_test(loss, input, target, dtype, device): @@ -21,50 +17,107 @@ def _do_test(loss, input, target, dtype, device): return output, grad_inputs -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - M, N = shape[0], shape[1] - input = torch.randn((M, N), requires_grad=True) - target = torch.empty((M, N)).random_(2) - for reduce in ["none", "mean", "sum"]: - loss = nn.BCELoss(reduce=reduce) - m = nn.Sigmoid() - input = m(input).to(dtype=dtype, device=device) - target = target.to(dtype=dtype, device=device) - # warm up +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w + 1 _do_test(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # go - print( - "shape:", - (M, N), - "; datatype:", - dtype, - "; reduce:", - reduce, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w + 1 - output_xpu, grad_input_xpu = _do_test( - loss, input, target, dtype, device - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w + 1 - output_xpu, grad_input_xpu = _do_test( - loss, input, target, dtype, device + +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + 1 + _do_test(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else 
torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + M, N = shape[0], shape[1] + input = torch.randn((M, N), requires_grad=True) + target = torch.empty((M, N)).random_(2) + cache_r = torch.randn(1024 * 1024 * 1024, device=args.device) + cache_w = torch.randn(1024 * 1024 * 1024, device=args.device) + for reduce in ["none", "mean", "sum"]: + loss = nn.BCELoss(reduce=reduce) + m = nn.Sigmoid() + input = m(input).to(dtype=dtype, device=args.device) + target = target.to(dtype=dtype, device=args.device) + # warm up + _do_test(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (M, N), + "; datatype:", + dtype, + "; reduce:", + reduce, + "; backward:", + backward, ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.ctc_loss.py b/test/microbench/loss.ctc_loss.py index a228663352..0026c41992 100644 --- a/test/microbench/loss.ctc_loss.py +++ b/test/microbench/loss.ctc_loss.py @@ -1,80 +1,128 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 # T,N,C,S shape_list = [(32, 32, 32, 16), (128, 128, 128, 128), (8, 8, 4, 8)] +backward = True -def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype): - log_probs_dpcpp = log_probs.to("xpu") - log_probs_dpcpp.requires_grad_(True) - targets_dpcpp = targets.to("xpu") - input_lengths_dpcpp = input_lengths.to("xpu") - target_lengths_dpcpp = target_lengths.to("xpu") - - # warm up +def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward): loss_dpcpp = torch.nn.functional.ctc_loss( - log_probs_dpcpp, targets_dpcpp, input_lengths_dpcpp, target_lengths_dpcpp - ) - loss_dpcpp.backward() - - # go - print( - "shape:", - (shape[0], shape[1], shape[2], shape[3]), - "; datatype:", - dtype, - "; backward:", - backward, + log_probs, targets, input_lengths, target_lengths ) + if backward: + loss_dpcpp.backward() + + +def run_profile( + log_probs, targets, input_lengths, target_lengths, backward, device, num_iter +): with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): - loss_dpcpp = 
torch.nn.functional.ctc_loss( - log_probs_dpcpp, - targets_dpcpp, - input_lengths_dpcpp, - target_lengths_dpcpp, - ) - loss_dpcpp.backward() - print(prof.key_averages().table(sort_by="xpu_time_total")) + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + - # E2E time - torch.xpu.synchronize() +def run_e2e( + log_probs, targets, input_lengths, target_lengths, backward, device, num_iter +): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): - loss_dpcpp = torch.nn.functional.ctc_loss( - log_probs_dpcpp, - targets_dpcpp, - input_lengths_dpcpp, - target_lengths_dpcpp, - ) - loss_dpcpp.backward() - torch.xpu.synchronize() + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -for shape in shape_list: - for dtype in [torch.float32]: - T, N, C, S = shape[0], shape[1], shape[2], shape[3] - g_cpu = torch.Generator() - g_cpu.manual_seed(15) - torch.manual_seed(15) - log_probs = ( - torch.randn(T, N, C, dtype=dtype).log_softmax(2).detach().requires_grad_() - ) - targets = torch.randint(1, N, (N, S), dtype=torch.long, generator=g_cpu) - input_lengths = torch.full((N,), T, dtype=torch.long) - target_lengths = torch.randint(1, S, (N,), dtype=torch.long, generator=g_cpu) - _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype) - g_cpu = torch.Generator() - g_cpu.manual_seed(15) - torch.manual_seed(15) +def benchmark(args): + for shape in shape_list: + for dtype in [torch.float32]: + T, N, C, S = shape[0], shape[1], shape[2], shape[3] + g_cpu = torch.Generator() + g_cpu.manual_seed(15) + torch.manual_seed(15) + log_probs = ( + torch.randn(T, N, C, dtype=dtype, device=args.device) + .log_softmax(2) + .detach() + .requires_grad_() + ) + targets = torch.randint(1, N, (N, S), dtype=torch.long, device=args.device) + input_lengths = torch.full((N,), T, dtype=torch.long, device=args.device) + target_lengths = torch.randint( + 1, S, (N,), dtype=torch.long, device=args.device + ) + + if backward: + log_probs.requires_grad_(True) + + # warm up + _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward) + # go + print( + "shape:", + (shape[0], shape[1], shape[2], shape[3]), + "; datatype:", + dtype, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + log_probs, + targets, + input_lengths, + target_lengths, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + log_probs, + targets, + input_lengths, + target_lengths, + backward, + args.device, + args.num_iter, + ) + g_cpu = torch.Generator() + g_cpu.manual_seed(15) + torch.manual_seed(15) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == 
"__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.l1_loss.py b/test/microbench/loss.l1_loss.py index 3d02e097fb..06d228ed12 100644 --- a/test/microbench/loss.l1_loss.py +++ b/test/microbench/loss.l1_loss.py @@ -1,68 +1,123 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [ (8732, 8732), (8192, 8732), ] +backward = True -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for reduce in ["none", "mean"]: - for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - B = shape[0] - S = shape[1] - input = torch.randn((B, S), requires_grad=True).to( - dtype=dtype, device=device - ) - target = torch.randn((B, S)).to(dtype=dtype, device=device) - loss = torch.nn.L1Loss(reduction=reduce) - - # warm up - output_xpu = loss(input, target) - output_xpu.backward(torch.ones_like(output_xpu, dtype=dtype, device=device)) - - # go - print( - "shape:", - (B, S), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output_xpu = loss(input, target) - cache_r = cache_w * i - output_xpu.backward( - torch.ones_like(output_xpu, dtype=dtype, device=device) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - output_xpu = loss(input, target) - cache_r = cache_w * i - output_xpu.backward( - torch.ones_like(output_xpu, dtype=dtype, device=device) + +def L1_loss(loss, input, target, dtype, backward, device): + output_xpu = loss(input, target) + if backward: + output_xpu.backward(torch.ones_like(output_xpu, dtype=dtype, device=device)) + + +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + L1_loss(loss, input, target, dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + L1_loss(loss, input, target, dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for reduce in ["none", "mean"]: + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + B = shape[0] + S = shape[1] + input = torch.randn((B, S), requires_grad=True).to( + dtype=dtype, device=args.device ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + target = torch.randn((B, S)).to(dtype=dtype, device=args.device) + loss = torch.nn.L1Loss(reduction=reduce) + cache_r = torch.randn((1024 * 
1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + L1_loss(loss, input, target, dtype, backward, args.device) + + # go + print( + "shape:", + (B, S), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.mse_loss.py b/test/microbench/loss.mse_loss.py index ae4a5394f2..f5d0f2b915 100644 --- a/test/microbench/loss.mse_loss.py +++ b/test/microbench/loss.mse_loss.py @@ -1,68 +1,119 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - +backward = True -def _do_test(loss, input, target, dtype, device): - input = input.to(dtype=dtype, device=device) - target = target.to(dtype=dtype, device=device) +def Mse_loss(loss, input, target, dtype, device): output = loss(input, target) grad_output = torch.ones_like(output, dtype=dtype, device=device) - grad_inputs = torch.autograd.grad(output, input, grad_output) - - # warm up - output = loss(input, target) output.backward(grad_output) - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) + +def run_profile( + loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter +): with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(grad_output) - print(prof.key_averages().table(sort_by="xpu_time_total")) + Mse_loss(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + - # E2E time - torch.xpu.synchronize() +def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(grad_output) - torch.xpu.synchronize() + Mse_loss(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == 
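# The loss scripts allocate two tensors of 1024**3 float32 elements (4 GiB
# each) and run `cache_r = cache_w * i` between timed calls; streaming that
# much data through memory is presumably meant to evict the benchmarked
# tensors from device caches so every iteration sees a cold cache. The shape
# of the trick:
import torch

def make_flush_tensors(device):
    # 2**30 float32 elements = 4 GiB per tensor, far larger than on-chip caches.
    cache_r = torch.randn(1024 * 1024 * 1024, device=device)
    cache_w = torch.randn(1024 * 1024 * 1024, device=device)
    return cache_r, cache_w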
"xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - input = torch.randn(shape, requires_grad=True) - target = torch.randn(shape) - loss = nn.MSELoss(reduction=reduce) - _do_test(loss, input, target, dtype, device) +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + input = torch.randn(shape, requires_grad=True) + target = torch.randn(shape) + input = input.to(dtype=dtype, device=args.device) + target = target.to(dtype=dtype, device=args.device) + loss = nn.MSELoss(reduction=reduce) + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Mse_loss(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + backward, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.multilabel_margin_loss.py b/test/microbench/loss.multilabel_margin_loss.py index 85609d6785..9da9b5ce3f 100644 --- a/test/microbench/loss.multilabel_margin_loss.py +++ b/test/microbench/loss.multilabel_margin_loss.py @@ -1,104 +1,119 @@ +import argparse import time import torch import torch.nn as nn from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 +backward = True -cache_r = torch.randn((1024 * 1024 * 1024), device="xpu") -cache_w = torch.randn((1024 * 1024 * 1024), device="xpu") +def Margin_loss(loss, input, target, reduce, dtype, device): + output = loss(input, target) + if reduce == "none": + output.backward(torch.ones_like(output, dtype=dtype).to(device)) + else: + output.backward(torch.tensor((1.0), dtype=dtype).to(device)) -def _test_dpcpp(input, target, reduce, dtype): - loss = nn.MultiLabelMarginLoss(reduction=reduce) - input.requires_grad = True - if reduce == "none": - # warm up - output = loss(input, target) - output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - 
output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +def run_profile(loss, input, target, reduce, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.ones_like(output, dtype=dtype).to("xpu")) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Margin_loss(loss, input, target, reduce, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - else: - # warm up - output = loss(input, target) - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w - output = loss(input, target) - cache_r = cache_w - output.backward(torch.tensor((1.0), dtype=dtype).to("xpu")) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") - - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - input = torch.randn(shape, dtype=dtype) - target = torch.randn(shape, dtype=dtype).long() - input_dpcpp = input.to("xpu") - target_dpcpp = target.to("xpu") - _test_dpcpp(input_dpcpp, target_dpcpp, reduce, dtype) + +def run_e2e(loss, input, target, reduce, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + Margin_loss(loss, input, target, reduce, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + input = torch.randn(shape, dtype=dtype, device=args.device) + target = torch.randn(shape, dtype=dtype, device=args.device).long() + input.requires_grad = True + loss = nn.MultiLabelMarginLoss(reduction=reduce) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Margin_loss(loss, input, target, reduce, dtype, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + reduce, + dtype, + cache_r, + cache_w, + args.device, + 
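# Margin_loss branches on the reduction mode because the backward seed must
# match the output shape: reduction="none" keeps one loss per sample, while
# "mean"/"sum" collapse to a 0-d tensor that takes a scalar seed. Shape check:
import torch
import torch.nn as nn

x = torch.randn(4, 8, requires_grad=True)
y = torch.randint(0, 8, (4, 8))  # valid class indices in [0, C-1]
print(nn.MultiLabelMarginLoss(reduction="none")(x, y).shape)  # torch.Size([4])
print(nn.MultiLabelMarginLoss(reduction="mean")(x, y).shape)  # torch.Size([])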
args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + reduce, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.nll_loss.py b/test/microbench/loss.nll_loss.py index 6ee7ed9a9f..8484c15c76 100644 --- a/test/microbench/loss.nll_loss.py +++ b/test/microbench/loss.nll_loss.py @@ -1,51 +1,110 @@ +import argparse import time import torch import torch.nn.functional as F from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True shape_list = [(8192, 8192)] -num_iter = 20 - -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape).to(device).to(dtype) - target = torch.empty(shape[0], dtype=torch.long).to(device) - for i in range(8192): - target[i] = i - x = torch.tensor(0.5).to(device).to(dtype) - input.requires_grad = True - - # warm up - output = F.nll_loss(input, target) - output.backward(x) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - cache_r = cache_w - output = F.nll_loss(input, target) - cache_r = cache_w - output.backward(x) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() +backward = True + + +def Nll_loss(loss, input, x, target, dtype, device): + output = loss(input, target) + output.backward(x) + + +def run_profile(loss, input, x, target, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): cache_r = cache_w - output = F.nll_loss(input, target) - cache_r = cache_w - output.backward(x) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Nll_loss(loss, input, x, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, x, target, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w + Nll_loss(loss, input, x, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + 
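# Nll_loss seeds backward with x = torch.tensor(0.5): F.nll_loss defaults to
# reduction="mean", so the output is 0-d and any 0-d seed works, scaling every
# input gradient by that factor. Minimal check:
import torch
import torch.nn.functional as F

inp = torch.log_softmax(torch.randn(4, 10), dim=1).requires_grad_(True)
tgt = torch.arange(4)
out = F.nll_loss(inp, tgt)        # 0-d under the default reduction="mean"
out.backward(torch.tensor(0.5))   # seed scales d(out)/d(inp) by 0.5
print(inp.grad.shape)             # torch.Size([4, 10])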
input = torch.randn(shape).to(args.device).to(dtype) + target = torch.empty(shape[0], dtype=torch.long).to(args.device) + for i in range(8192): + target[i] = i + input.requires_grad = True + loss = F.nll_loss + x = torch.tensor(0.5).to(args.device).to(dtype) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Nll_loss(loss, input, x, target, dtype, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile( + loss, + input, + x, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + x, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/loss.smooth_l1_loss.py b/test/microbench/loss.smooth_l1_loss.py index b71739d9e8..17c1357c62 100644 --- a/test/microbench/loss.smooth_l1_loss.py +++ b/test/microbench/loss.smooth_l1_loss.py @@ -1,64 +1,116 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 shape_list = [(8732, 8732), (8192, 8732)] -cache_r = torch.randn((1024 * 1024 * 1024), device=device) -cache_w = torch.randn((1024 * 1024 * 1024), device=device) - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for reduce in ["none", "mean"]: - B = shape[0] - S = shape[1] - input = torch.randn((B, S), requires_grad=True).to( - dtype=dtype, device=device - ) - target = torch.randn((B, S)).to(dtype=dtype, device=device) - loss = torch.nn.SmoothL1Loss(reduction=reduce) - - # warm up - output = loss(input, target) - output.backward(torch.ones_like(output, dtype=dtype, device=device)) - - # go - print( - "shape:", - (B, S), - "; datatype:", - dtype, - "; backward:", - backward, - "; reduce: 0" if (reduce == "none") else "; reduce: 1", - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - cache_r = cache_w * i - output = loss(input, target) - cache_r = cache_w * i - output.backward( - torch.ones_like(output, dtype=torch.float, device=device) - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - cache_r = cache_w * i - output = loss(input, target) - cache_r = cache_w * i - output.backward( - torch.ones_like(output, dtype=torch.float, device=device) +backward = True + + +def Smooth_l1_loss(loss, input, target, dtype, device): + output = loss(input, target) + output.backward(torch.ones_like(output, dtype=dtype, device=device)) + + +def run_profile(loss, input, target, dtype, cache_r, cache_w, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + 
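# One behavioral change worth noting in loss.smooth_l1_loss.py: the old loop
# seeded backward with torch.ones_like(output, dtype=torch.float, device=device)
# even in the bfloat16/float16 sweeps, so half-precision runs used a float32
# gradient seed; the refactored Smooth_l1_loss builds the seed with the
# benchmarked dtype, keeping the whole backward pass in the measured precision.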
ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + cache_r = cache_w * i + Smooth_l1_loss(loss, input, target, dtype, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(loss, input, target, dtype, cache_r, cache_w, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + cache_r = cache_w * i + Smooth_l1_loss(loss, input, target, dtype, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for reduce in ["none", "mean"]: + B = shape[0] + S = shape[1] + input = torch.randn((B, S), requires_grad=True).to( + dtype=dtype, device=args.device ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + target = torch.randn((B, S)).to(dtype=dtype, device=args.device) + loss = torch.nn.SmoothL1Loss(reduction=reduce) + + cache_r = torch.randn((1024 * 1024 * 1024), device=args.device) + cache_w = torch.randn((1024 * 1024 * 1024), device=args.device) + + # warm up + Smooth_l1_loss(loss, input, target, dtype, args.device) + + # go + print( + "shape:", + (B, S), + "; datatype:", + dtype, + "; backward:", + backward, + "; reduce: 0" if (reduce == "none") else "; reduce: 1", + ) + if not args.e2e_only: + run_profile( + loss, + input, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + loss, + input, + target, + dtype, + cache_r, + cache_w, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/matmul.py b/test/microbench/matmul.py index f79606b02b..0b23362634 100644 --- a/test/microbench/matmul.py +++ b/test/microbench/matmul.py @@ -1,10 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 shape_list = [ (4, 4096, 50400), (4, 2048, 32000), @@ -17,9 +16,10 @@ (4, 2048, 50272), (4, 1792, 250880), ] +backward = True -def matmul(m, n, k, dtype, backward): +def matmul(m, n, k, dtype, backward, device): m1 = torch.rand(2, m, k).type(dtype).to(device) m2 = torch.rand(k, n).type(dtype).to(device) if backward: @@ -32,29 +32,64 @@ def matmul(m, n, k, dtype, backward): output.backward(gy) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) 
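# matmul() builds m1 with shape (2, m, k) and m2 with (k, n); torch.matmul
# broadcasts the 2-D operand over the leading batch dim, giving (2, m, n).
# The call sites pass matmul(shape[0], shape[2], shape[1], ...), so a tuple
# like (4, 4096, 50400) means m=4, k=4096, n=50400. Scaled-down shape check:
import torch

m1 = torch.rand(2, 4, 6)   # (batch, m, k)
m2 = torch.rand(6, 5)      # (k, n), broadcast across the batch
print(torch.matmul(m1, m2).shape)  # torch.Size([2, 4, 5])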
as prof: + for i in range(num_iter): + matmul(shape[0], shape[2], shape[1], dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + matmul(shape[0], shape[2], shape[1], dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: # warm up - matmul(shape[0], shape[2], shape[1], dtype, backward) + matmul(shape[0], shape[2], shape[1], dtype, backward, args.device) # go print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - matmul(shape[0], shape[2], shape[1], dtype, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - matmul(shape[0], shape[2], shape[1], dtype, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile(shape, dtype, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(shape, dtype, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pad_sequence.py b/test/microbench/pad_sequence.py index a19d265a33..ab5142fdb3 100644 --- a/test/microbench/pad_sequence.py +++ b/test/microbench/pad_sequence.py @@ -1,70 +1,129 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 shape_list = [((25, 300), (22, 300), (15, 300)), ((2, 1000), (100, 1000), (8192, 1000))] +backward = False + + +def Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device): + output = torch.nn.utils.rnn.pad_sequence(([a, b, c]), batch_first, padding_value) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(a, b, c, batch_first, padding_value, dtype, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, b, c, batch_first, padding_value, dtype, backward, device, num_iter): + if 
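# Pad_sequence pads the three variable-length inputs up to the longest and
# stacks them on a new batch dim: batch_first=False yields (T_max, B, *),
# batch_first=True yields (B, T_max, *). For ((25, 300), (22, 300), (15, 300))
# that is (25, 3, 300) or (3, 25, 300), filled with padding_value. Small check:
import torch
from torch.nn.utils.rnn import pad_sequence

a, b, c = torch.randn(5, 4), torch.randn(3, 4), torch.randn(2, 4)
print(pad_sequence([a, b, c]).shape)                    # torch.Size([5, 3, 4])
print(pad_sequence([a, b, c], batch_first=True).shape)  # torch.Size([3, 5, 4])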
device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Pad_sequence(a, b, c, batch_first, padding_value, dtype, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for batch_first in [False, True]: + for padding_value in [0.0, 1.0, 2.0]: + a = torch.randn(shape[0], device=args.device, dtype=dtype) + b = torch.randn(shape[1], device=args.device, dtype=dtype) + c = torch.randn(shape[2], device=args.device, dtype=dtype) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for batch_first in [False, True]: - for padding_value in [0.0, 1.0, 2.0]: - a = torch.randn(shape[0], device=device, dtype=dtype) - b = torch.randn(shape[1], device=device, dtype=dtype) - c = torch.randn(shape[2], device=device, dtype=dtype) - - if backward: - a.requires_grad_(True) - b.requires_grad_(True) - c.requires_grad_(True) - - # warm up - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; batch_first:", - batch_first, - "; padding_value:", - padding_value, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.nn.utils.rnn.pad_sequence( - ([a, b, c]), batch_first, padding_value - ) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + a.requires_grad_(True) + b.requires_grad_(True) + c.requires_grad_(True) + + # warm up + Pad_sequence( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + ) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; batch_first:", + batch_first, + "; padding_value:", + padding_value, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + a, + b, + c, + batch_first, + padding_value, + dtype, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == 
"__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.adaptive_max_pool2d.py b/test/microbench/pooling.adaptive_max_pool2d.py index 5c774d76ac..35ce3610d8 100644 --- a/test/microbench/pooling.adaptive_max_pool2d.py +++ b/test/microbench/pooling.adaptive_max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,25 +9,26 @@ (8, 512, 32, 32, (7, 7)), (8, 256, 56, 56, (14, 14)), ] +backward = True -def adaptive_mp2d(shape, dtype, channels_last, backward): +def adaptive_mp2d(shape, dtype, channels_last, backward, device): N, C, H, W, output_size = shape[0], shape[1], shape[2], shape[3], shape[4] if channels_last: input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) Wout = output_size[0] Hout = output_size[1] - grad = torch.randn([N, C, Hout, Wout]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, Hout, Wout]).to(device=device, dtype=dtype) adapt_mp2d = torch.nn.AdaptiveMaxPool2d( output_size=(Hout, Wout), return_indices=True @@ -38,14 +40,38 @@ def adaptive_mp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + adaptive_mp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + adaptive_mp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - adaptive_mp2d(shape, dtype, channels_last, backward) + adaptive_mp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -60,20 +86,44 @@ def adaptive_mp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - adaptive_mp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - adaptive_mp2d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = 
argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.fractional_max_pool2d.py b/test/microbench/pooling.fractional_max_pool2d.py index b35fc0571b..d49abd640d 100644 --- a/test/microbench/pooling.fractional_max_pool2d.py +++ b/test/microbench/pooling.fractional_max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -9,23 +10,24 @@ (1, 3, 1200, 1200, 600, 600), (512, 512, 28, 28, 14, 14), ] +backward = True -def fmp2d(shape, dtype, channels_last, backward): +def fmp2d(shape, dtype, channels_last, backward, device): N, C, H, W, oH, oW = shape[0], shape[1], shape[2], shape[3], shape[4], shape[5] if channels_last: input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW]).to(device=device, dtype=dtype) fmp = torch.nn.FractionalMaxPool2d(2, output_size=(oH, oW), return_indices=True) @@ -35,14 +37,38 @@ def fmp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + fmp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + fmp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - fmp2d(shape, dtype, channels_last, backward) + fmp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -55,20 +81,44 @@ def fmp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - fmp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - fmp2d(shape, dtype, channels_last, backward) - 
torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.fractional_max_pool3d.py b/test/microbench/pooling.fractional_max_pool3d.py index 26d8921044..1721f80e9a 100644 --- a/test/microbench/pooling.fractional_max_pool3d.py +++ b/test/microbench/pooling.fractional_max_pool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (1, 3, 144, 144, 144, 72, 72, 72), (512, 512, 12, 12, 12, 6, 6, 6), ] +backward = True -def fmp3d(shape, dtype, channels_last, backward): +def fmp3d(shape, dtype, channels_last, backward, device): N, C, H, W, D, oH, oW, oD = ( shape[0], shape[1], @@ -26,14 +28,14 @@ def fmp3d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W, D) .to(memory_format=torch.channels_last_3d) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W, D).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W, D).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW, oD]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW, oD]).to(device=device, dtype=dtype) fmp = torch.nn.FractionalMaxPool3d(2, output_size=(oH, oW, oD), return_indices=True) @@ -43,14 +45,38 @@ def fmp3d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + fmp3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + fmp3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - fmp3d(shape, dtype, channels_last, backward) + fmp3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -63,20 +89,44 
@@ def fmp3d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - fmp3d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - fmp3d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_pool2d.py b/test/microbench/pooling.max_pool2d.py index 37e88646ab..540bb64168 100644 --- a/test/microbench/pooling.max_pool2d.py +++ b/test/microbench/pooling.max_pool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -9,9 +10,10 @@ (64, 1024, 112, 112, (6), (4)), (16, 2048, 224, 224, (3), (2)), ] +backward = True -def mp2d(shape, dtype, channels_last, backward): +def mp2d(shape, dtype, channels_last, backward, device): N, C, H, W, kernel_size, stride = ( shape[0], shape[1], @@ -25,10 +27,10 @@ def mp2d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) @@ -38,7 +40,7 @@ def mp2d(shape, dtype, channels_last, backward): else: Wout = (W - kernel_size[1]) / stride[1] + 1 Hout = (H - kernel_size[0]) / stride[0] + 1 - grad = torch.randn([N, C, int(Hout), int(Wout)]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, int(Hout), int(Wout)]).to(device=device, dtype=dtype) mp2d = torch.nn.MaxPool2d(shape[4], stride=shape[5], return_indices=True) @@ -48,14 +50,38 @@ def mp2d(shape, dtype, channels_last, backward): output[0].backward(grad) -if __name__ == "__main__": - backward = True - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + mp2d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in 
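# mp2d sizes the gradient with the no-padding pooling arithmetic
# Hout = (H - kernel) / stride + 1 (truncated to int), matching MaxPool2d's
# floor behavior. For the (16, 2048, 224, 224, (3), (2)) entry:
# (224 - 3) / 2 + 1 = 111.5 -> 111. Check:
import torch

out, idx = torch.nn.MaxPool2d(3, stride=2, return_indices=True)(
    torch.randn(1, 1, 224, 224)
)
print(out.shape)  # torch.Size([1, 1, 111, 111])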
range(num_iter): + mp2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - mp2d(shape, dtype, channels_last, backward) + mp2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -72,20 +98,44 @@ def mp2d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - mp2d(shape, dtype, channels_last, backward) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - mp2d(shape, dtype, channels_last, backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_pool3d.py b/test/microbench/pooling.max_pool3d.py index 0563287a7d..2b488d391a 100644 --- a/test/microbench/pooling.max_pool3d.py +++ b/test/microbench/pooling.max_pool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,10 +9,10 @@ (1, 4, 144, 144, 144, 72, 72, 72), (512, 512, 12, 12, 12, 6, 6, 6), ] -num_iter = 20 +backward = True -def fmp3d(shape, dtype, channels_last, backward): +def mp3d(shape, dtype, channels_last, backward, device): torch.manual_seed(20) N, C, H, W, D, oH, oW, oD = ( shape[0], @@ -28,59 +29,101 @@ def fmp3d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W, D) .to(memory_format=torch.channels_last_3d) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W, D).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W, D).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([N, C, oH, oW, oD]).to(device="xpu", dtype=dtype) + grad = torch.randn([N, C, oH, oW, oD]).to(device=device, dtype=dtype) fmp = torch.nn.MaxPool3d(2, return_indices=True) output = fmp(input) - # warm up - output = fmp(input) - if backward: - output[0].backward(grad) - - # go - print( - "shape:", - (shape[0], shape[1], shape[2], shape[3], shape[4]), - "; datatype:", - dtype, - "; channels_last:", - channels_last, - "; backward:", - backward, - ) + +def run_profile(shape, dtype, channels_last, backward, device, num_iter): 
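# Note: unlike mp2d above, the refactored mp3d ends at `output = fmp(input)`;
# it still allocates grad when backward=True but never seeds a backward pass,
# so both timing modes for max_pool3d appear to measure only the forward op
# even though the printed header reports backward: True.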
with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, ) as prof: for i in range(num_iter): - output = fmp(input) - if backward: - output[0].backward(grad) - print(prof.key_averages().table(sort_by="xpu_time_total")) + mp3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) - # E2E time - torch.xpu.synchronize() + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t1 = time.time() for i in range(num_iter): - output = fmp(input) - if backward: - output[0].backward(grad) - torch.xpu.synchronize() + mp3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() t2 = time.time() e2e_time = (t2 - t1) / num_iter print("E2E total time:", f"{float(e2e_time):.20f}") -if __name__ == "__main__": - backward = True +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: - fmp3d(shape, dtype, channels_last, backward=True) + # warm up + mp3d(shape, dtype, channels_last, backward, args.device) + + # go + print( + "shape:", + (shape[0], shape[1], shape[2], shape[3], shape[4]), + "; datatype:", + dtype, + "; channels_last:", + channels_last, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_unpool2d.py b/test/microbench/pooling.max_unpool2d.py index 9d0c3388e5..9c48a8c8f7 100644 --- a/test/microbench/pooling.max_unpool2d.py +++ b/test/microbench/pooling.max_unpool2d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (4, 65, 128, 128), (8, 128, 128, 128), ] +backward = True -def maxUnpool2d(shape, dtype, device, channels_last, backward): +def maxUnpool2d(shape, dtype, channels_last, backward, device): N, C, H, W = int(shape[0]), int(shape[1]), int(shape[2]), int(shape[3]) kernel_size = 2 @@ -58,15 +60,38 @@ def maxUnpool2d(shape, dtype, device, channels_last, backward): y_dpcpp.backward(grad_dpcpp) -if __name__ == "__main__": - backward = True - device = "xpu" - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + maxUnpool2d(shape, 
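# MaxUnpool2d is the inverse of MaxPool2d: it takes the pooled values plus the
# indices that return_indices=True recorded, scatters each value back to its
# argmax position, and zero-fills everything else. Minimal round trip:
import torch

pool = torch.nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = torch.nn.MaxUnpool2d(2, stride=2)
x = torch.randn(1, 1, 4, 4)
y, idx = pool(x)
print(unpool(y, idx).shape)  # torch.Size([1, 1, 4, 4])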
dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + maxUnpool2d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - maxUnpool2d(shape, dtype, device, channels_last, backward=backward) + maxUnpool2d(shape, dtype, channels_last, backward, args.device) # go print( @@ -81,22 +106,44 @@ def maxUnpool2d(shape, dtype, device, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - maxUnpool2d( - shape, dtype, device, channels_last, backward=backward - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - maxUnpool2d(shape, dtype, device, channels_last, backward=backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/pooling.max_unpool3d.py b/test/microbench/pooling.max_unpool3d.py index 66610eaaf7..91d87d15c1 100644 --- a/test/microbench/pooling.max_unpool3d.py +++ b/test/microbench/pooling.max_unpool3d.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -8,9 +9,10 @@ (4, 33, 64, 64, 64), (16, 32, 32, 32, 32), ] +backward = True -def maxUnpool3d(shape, dtype, device, channels_last, backward): +def maxUnpool3d(shape, dtype, channels_last, backward, device): N, C, D, H, W = ( int(shape[0]), int(shape[1]), @@ -64,15 +66,38 @@ def maxUnpool3d(shape, dtype, device, channels_last, backward): y_dpcpp.backward(grad_dpcpp) -if __name__ == "__main__": - backward = True - device = "xpu" - num_iter = 20 +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + maxUnpool3d(shape, dtype, channels_last, backward, device) + 
print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + maxUnpool3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - maxUnpool3d(shape, dtype, device, channels_last, backward=backward) + maxUnpool3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -87,22 +112,44 @@ def maxUnpool3d(shape, dtype, device, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - maxUnpool3d( - shape, dtype, device, channels_last, backward=backward - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - maxUnpool3d(shape, dtype, device, channels_last, backward=backward) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/reduce.max.py b/test/microbench/reduce.max.py index cf03fe413a..d20b4d3c09 100644 --- a/test/microbench/reduce.max.py +++ b/test/microbench/reduce.max.py @@ -1,49 +1,89 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 + + +def Max(input, dim, backward, device): + output = torch.max(input, dim) + + +def run_profile(input, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Max(input, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Max(input, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == 
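# torch.max(input, dim) returns a (values, indices) namedtuple. The comment in
# reduce.max.py distinguishes dim=1 (reduce along the contiguous last dim of a
# row-major (8192, 8192) tensor, coalesced reads) from dim=0 (reduce along the
# strided dim), which exercises a different memory-access pattern. Shape check:
import torch

x = torch.randn(4, 6)
vals, idx = torch.max(x, 1)   # one (value, index) per row
print(vals.shape, idx.shape)  # torch.Size([4]) torch.Size([4])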
"xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + # dim = 1: reduce along contiguous dim # dim = 0: reduce along strided dim -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [1, 0]: - input = torch.randn(8192, 8192, dtype=dtype, device=device) - - # warm up - output = torch.max(input, 1) - output = torch.max(input, 0) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.max(input, dim) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.max(input, dim) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [1, 0]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Max(input, dim, backward, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/reduce.sum.py b/test/microbench/reduce.sum.py index de092c4ee8..32b8d3eda2 100644 --- a/test/microbench/reduce.sum.py +++ b/test/microbench/reduce.sum.py @@ -1,57 +1,93 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" shape_list = [(8192, 8192)] backward = False -num_iter = 20 + + +def Sum(input, dim, backward, device): + if dim is None: + output = torch.sum(input) + else: + output = torch.sum(input, dim) + + +def run_profile(input, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Sum(input, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Sum(input, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", 
f"{float(e2e_time):.20f}") + # dim = None: reduce all # dim = 0: reduce along strided dim # dim = 1: reduce along contiguous dim -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [None, 0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - output = torch.sum(input) - output = torch.sum(input, 0) - output = torch.sum(input, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - if dim is None: - output = torch.sum(input) - else: - output = torch.sum(input, dim) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - if dim is None: - output = torch.sum(input) - else: - output = torch.sum(input, dim) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [None, 0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Sum(input, dim, backward, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/remainder.py b/test/microbench/remainder.py index 3a0c79ce32..09b4e9026a 100644 --- a/test/microbench/remainder.py +++ b/test/microbench/remainder.py @@ -1,58 +1,92 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(1024, 1024, 1024), (6, 7, 3, 2), (8193, 8193, 4, 4)] backward = True -num_iter = 20 -shape_list = [(1024, 1024, 1024), (6, 7, 3, 2), (8193, 8193, 4, 4)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for divisor in [2, -1.5, 3]: - input = torch.randn(shape, device=device, dtype=dtype) - if backward: - input.requires_grad_(True) - - # warm - output = torch.remainder(input, divisor) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; divisor:", - divisor, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.remainder(input, divisor) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - 
print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.remainder(input, divisor) +def Remainder(input, divisor, backward, device): + output = torch.remainder(input, divisor) + if backward: + gy = torch.empty_like(output) + output.backward(gy) + + +def run_profile(input, divisor, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Remainder(input, divisor, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, divisor, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Remainder(input, divisor, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for divisor in [2, -1.5, 3]: + input = torch.randn(shape, device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm + Remainder(input, divisor, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; divisor:", + divisor, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, divisor, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, divisor, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/repeat_interleave.py b/test/microbench/repeat_interleave.py index b0a31207ef..dc228b93c6 100644 --- a/test/microbench/repeat_interleave.py +++ b/test/microbench/repeat_interleave.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -7,58 +8,94 @@ (16, 8, 23), (4, 2048, 2048), ] -device = "xpu" backward = False -num_iter = 20 -for shape in shape_list: - for repeats in [8]: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 2]: - input = torch.randn(shape, device=device, dtype=dtype) - if backward: - input.requires_grad_(True) +def Repeat_interleave(input, repeats, dim, backward, device): + output = torch.repeat_interleave(input, repeats, dim) + if backward: + gy = torch.empty_like(output) + output.backward(gy) - # warm up - for i in range(5): - output = torch.repeat_interleave(input, repeats, dim) + +def run_profile(input, 
repeats, dim, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Repeat_interleave(input, repeats, dim, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, repeats, dim, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Repeat_interleave(input, repeats, dim, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for repeats in [8]: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 2]: + input = torch.randn(shape, device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - # go - print( - "shape:", - shape, - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - output = torch.repeat_interleave(input, repeats, dim) - - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - output = torch.repeat_interleave(input, repeats, dim) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm up + Repeat_interleave(input, repeats, dim, backward, args.device) + + # go + print( + "shape:", + shape, + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, repeats, dim, backward, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e( + input, repeats, dim, backward, args.device, args.num_iter + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/roll.py b/test/microbench/roll.py index 7680d7e784..28c3e98a07 100644 --- a/test/microbench/roll.py +++ b/test/microbench/roll.py @@ -1,12 +1,9 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = True -num_iter = 20 - shape_list = [ ((1024, 1024, 1024), (-1), (0)), ((1024, 1024, 1024), (128, 128), (-1, 0)), @@ -15,51 +12,89 @@ ((16, 3, 512, 512), (127), (0)), ((16, 3, 512, 512), (127, 127), (0, -1)), ] +backward = True + + +def Roll(input, shape, 
backward, device): + output = torch.roll(input, shifts=shape[1], dims=shape[2]) + if backward: + gy = torch.empty_like(output) + output.backward(gy) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape[0], device=device, dtype=dtype) - if backward: - input.requires_grad_(True) - - # warm - output = torch.roll(input, shifts=shape[1], dims=shape[2]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - - # go - print( - "shape:", - shape[0], - "; datatype:", - dtype, - "; dim:", - shape[2], - "; shifts:", - shape[1], - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.roll(input, shifts=shape[1], dims=shape[2]) - if backward: - gy = torch.empty_like(output) - output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + +def run_profile(input, shape, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.roll(input, shifts=shape[1], dims=shape[2]) + Roll(input, shape, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, shape, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Roll(input, shape, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape[0], device=args.device, dtype=dtype) if backward: - gy = torch.empty_like(output) - output.backward(gy) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + input.requires_grad_(True) + + # warm + Roll(input, shape, backward, args.device) + + # go + print( + "shape:", + shape[0], + "; datatype:", + dtype, + "; dim:", + shape[2], + "; shifts:", + shape[1], + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, shape, backward, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, shape, backward, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.cumsum.py b/test/microbench/scan.cumsum.py index 944a402a64..7e1699b813 100644 --- a/test/microbench/scan.cumsum.py +++ b/test/microbench/scan.cumsum.py @@ -1,48 +1,87 @@ +import 
argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(8193, 8193), (1234, 8193), (8192, 1234), (1, 4 * 15000)] backward = False -num_iter = 20 -shape_list = [(8193, 8193), (1234, 8193), (8192, 1234), (1, 4 * 15000)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - torch.cumsum(input, 0) - torch.cumsum(input, 1) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.cumsum(input, 0) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - torch.cumsum(input, 0) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") +def Cumsum(input, dim, device): + torch.cumsum(input, dim) + + +def run_profile(input, dim, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Cumsum(input, dim, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Cumsum(input, dim, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Cumsum(input, dim, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(input, dim, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, dim, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.masked_select.py b/test/microbench/scan.masked_select.py index 6ed1b9cf63..fe46879679 100644 --- a/test/microbench/scan.masked_select.py +++ b/test/microbench/scan.masked_select.py @@ -1,36 +1,77 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(8193, 8193)] -device = "xpu" backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in 
[torch.bfloat16, torch.float16, torch.float32]: - input = torch.randn(shape, dtype=dtype, device=device) - mask = input.ge(0.5) - # warm up - torch.masked_select(input, mask) - - # go - print("shape:", shape, "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.masked_select(input, mask) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Masked_select(input, mask, device): + torch.masked_select(input, mask) + + +def run_profile(input, mask, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.masked_select(input, mask) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Masked_select(input, mask, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, mask, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Masked_select(input, mask, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randn(shape, dtype=dtype, device=args.device) + mask = input.ge(0.5) + # warm up + Masked_select(input, mask, args.device) + + # go + print("shape:", shape, "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, mask, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, mask, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.nonzero.py b/test/microbench/scan.nonzero.py index e2f6afbdad..a98d4b3171 100644 --- a/test/microbench/scan.nonzero.py +++ b/test/microbench/scan.nonzero.py @@ -1,39 +1,80 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity shape_list = [(2047, 2047, 10), (1, 4 * 15000)] -device = "xpu" backward = False -num_iter = 20 - -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - if shape == (2047, 2047, 10): - input = torch.randint(-2, 3, shape, dtype=dtype, device=device) - else: - input = torch.randn(shape, dtype=dtype, device=device) - - # warm up - torch.nonzero(input) - - # go - print("shape:", shape, "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, 
ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.nonzero(input) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Nonzero(input, device): + torch.nonzero(input) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.nonzero(input) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Nonzero(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Nonzero(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + if shape == (2047, 2047, 10): + input = torch.randint(-2, 3, shape, dtype=dtype, device=args.device) + else: + input = torch.randn(shape, dtype=dtype, device=args.device) + + # warm up + Nonzero(input, args.device) + + # go + print("shape:", shape, "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.topk.py b/test/microbench/scan.topk.py index 32e0ee6186..a25270d43d 100644 --- a/test/microbench/scan.topk.py +++ b/test/microbench/scan.topk.py @@ -1,57 +1,94 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -backward = False -num_iter = 20 - shape_list = [(8193, 8193)] +backward = False k = 4096 largest = True sorted = True -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - for dim in [None, 0, 1]: - input = torch.randn(shape, dtype=dtype, device=device) - # warm up - torch.topk(input, k) - torch.topk(input, k, 0, largest, sorted) - torch.topk(input, k, 1, largest, sorted) - - # go - print( - "shape:", - (shape), - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - if dim is None: - torch.topk(input, k) - else: - torch.topk(input, k, dim, largest, sorted) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - 
torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - if dim is None: - torch.topk(input, k) - else: - torch.topk(input, k, dim, largest, sorted) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + +def Topk(input, dim, k, largest, sorted, device): + if dim is None: + torch.topk(input, k) + else: + torch.topk(input, k, dim, largest, sorted) + + +def run_profile(input, dim, k, largest, sorted, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Topk(input, dim, k, largest, sorted, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, dim, k, largest, sorted, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Topk(input, dim, k, largest, sorted, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + for dim in [None, 0, 1]: + input = torch.randn(shape, dtype=dtype, device=args.device) + # warm up + Topk(input, dim, k, largest, sorted, args.device) + + # go + print( + "shape:", + (shape), + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile( + input, dim, k, largest, sorted, args.device, args.num_iter + ) + + if not args.profile_only: + run_e2e(input, dim, k, largest, sorted, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scan.unique.py b/test/microbench/scan.unique.py index 5b17d7b16e..1a44e36e16 100644 --- a/test/microbench/scan.unique.py +++ b/test/microbench/scan.unique.py @@ -1,40 +1,77 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +shape_list = [(2049, 2049)] backward = False -num_iter = 20 -shape_list = [(2049, 2049)] -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - input = torch.randint(100, shape, dtype=dtype, device=device) - - # warm up - torch.unique(input, sorted=True, return_inverse=True, return_counts=True) - - # go - print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True - ) as prof: - for i in range(num_iter): - output = torch.unique( - input, sorted=True, return_inverse=True, return_counts=True - ) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # 
E2E time - torch.xpu.synchronize() - t1 = time.time() +def Unique(input, device): + torch.unique(input, sorted=True, return_inverse=True, return_counts=True) + + +def run_profile(input, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - output = torch.unique( - input, sorted=True, return_inverse=True, return_counts=True - ) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Unique(input, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(input, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Unique(input, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + input = torch.randint(100, shape, dtype=dtype, device=args.device) + + # warm up + Unique(input, args.device) + + # go + print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) + if not args.e2e_only: + run_profile(input, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(input, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scatter_gather.gather.py b/test/microbench/scatter_gather.gather.py index aa2279a800..a1bfc690f3 100644 --- a/test/microbench/scatter_gather.gather.py +++ b/test/microbench/scatter_gather.gather.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -15,47 +16,93 @@ ((10240, 8192), (2560, 8192), 0), ((8192, 10240), (2048, 10240), 0), ] - -device = "xpu" backward = False -num_iter = 20 - -g_xpu = torch.Generator(device=device) -g_xpu.manual_seed(25) -torch.manual_seed(25) -for shape in shape_list: - for dtype in [torch.bfloat16, torch.float16, torch.float32]: - shapes = shape[0] - ishapes = shape[1] - dim = shape[2] - a = torch.randn(shapes, dtype=dtype, device=device) - index = torch.randint(1, shapes[dim], ishapes, device=device, generator=g_xpu) - print( - "shape:", - shapes, - "; kernel_size:", - ishapes, - "; datatype:", - dtype, - "; dim:", - dim, - "; backward:", - backward, - ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - torch.gather(a, dim, index) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() + + +def Gather(a, dim, index, device): + torch.gather(a, dim, index) + + +def run_profile(a, 
dim, index, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: for i in range(num_iter): - torch.gather(a, dim, index) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + Gather(a, dim, index, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(a, dim, index, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Gather(a, dim, index, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): + for shape in shape_list: + for dtype in [torch.bfloat16, torch.float16, torch.float32]: + shapes = shape[0] + ishapes = shape[1] + dim = shape[2] + g_xpu = torch.Generator(device=args.device) + g_xpu.manual_seed(25) + torch.manual_seed(25) + a = torch.randn(shapes, dtype=dtype, device=args.device) + index = torch.randint( + 1, shapes[dim], ishapes, device=args.device, generator=g_xpu + ) + + # warm up + Gather(a, dim, index, args.device) + + # go + print( + "shape:", + shapes, + "; kernel_size:", + ishapes, + "; datatype:", + dtype, + "; dim:", + dim, + "; backward:", + backward, + ) + if not args.e2e_only: + run_profile(a, dim, index, args.device, args.num_iter) + + if not args.profile_only: + run_e2e(a, dim, index, args.device, args.num_iter) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/scatter_gather.scatter.py b/test/microbench/scatter_gather.scatter.py index 7c5f105d7a..707a390a3a 100644 --- a/test/microbench/scatter_gather.scatter.py +++ b/test/microbench/scatter_gather.scatter.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -19,17 +20,10 @@ ((4096, 8192, 8192), 1), ((4097, 8193, 8193), 1), ] - -device = "xpu" backward = False -num_iter = 20 - -g_xpu = torch.Generator(device=device) -g_xpu.manual_seed(25) -torch.manual_seed(25) -def Scatter(shape, dtype, dim, device): +def Scatter(shape, dtype, dim, g_xpu, device): if dim == 2: m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3] src = torch.ones((m, n, k1), dtype=dtype, device=device) @@ -50,12 +44,41 @@ def Scatter(shape, dtype, dim, device): dst = zeros.scatter_(dim, index, src) -if __name__ == "__main__": +def run_profile(shape, dtype, dim, g_xpu, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Scatter(shape, dtype, dim, g_xpu, device) + 
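The gather benchmark above (and the scatter variants below) seeds a per-device torch.Generator so the random index tensors are identical on every run. A minimal sketch of that pattern (illustrative; make_index is not a helper in this patch):

    import torch

    def make_index(src_shape, index_shape, dim, device, seed=25):
        # A fixed seed keeps the index tensor reproducible across runs,
        # so timing differences cannot come from different access patterns.
        g = torch.Generator(device=device)
        g.manual_seed(seed)
        return torch.randint(
            1, src_shape[dim], index_shape, device=device, generator=g
        )
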
diff --git a/test/microbench/scatter_gather.scatter.py b/test/microbench/scatter_gather.scatter.py
index 7c5f105d7a..707a390a3a 100644
--- a/test/microbench/scatter_gather.scatter.py
+++ b/test/microbench/scatter_gather.scatter.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -19,17 +20,10 @@
     ((4096, 8192, 8192), 1),
     ((4097, 8193, 8193), 1),
 ]
-
-device = "xpu"
 backward = False
-num_iter = 20
-
-g_xpu = torch.Generator(device=device)
-g_xpu.manual_seed(25)
-torch.manual_seed(25)
 
 
-def Scatter(shape, dtype, dim, device):
+def Scatter(shape, dtype, dim, g_xpu, device):
     if dim == 2:
         m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3]
         src = torch.ones((m, n, k1), dtype=dtype, device=device)
@@ -50,12 +44,41 @@ def Scatter(shape, dtype, dim, device):
         dst = zeros.scatter_(dim, index, src)
 
 
-if __name__ == "__main__":
+def run_profile(shape, dtype, dim, g_xpu, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Scatter(shape, dtype, dim, g_xpu, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, dim, g_xpu, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Scatter(shape, dtype, dim, g_xpu, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             dim = shape[1]
+            g_xpu = torch.Generator(device=args.device)
+            g_xpu.manual_seed(25)
+            torch.manual_seed(25)
 
             # warm up
-            Scatter(shape, dtype, dim, device)
+            Scatter(shape, dtype, dim, g_xpu, args.device)
 
             # go
             print(
@@ -68,20 +91,30 @@ def Scatter(shape, dtype, dim, device):
                 "; backward:",
                 backward,
             )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    Scatter(shape, dtype, dim, device)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                Scatter(shape, dtype, dim, device)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+            if not args.e2e_only:
+                run_profile(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/scatter_gather.scatter_add.py b/test/microbench/scatter_gather.scatter_add.py
index 97835bc093..0e73f86594 100644
--- a/test/microbench/scatter_gather.scatter_add.py
+++ b/test/microbench/scatter_gather.scatter_add.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -20,17 +21,10 @@
     ((4096, 8192, 8192), 1),
     ((4097, 8193, 8193), 1),
 ]
-
-device = "xpu"
 backward = False
-num_iter = 20
-
-g_xpu = torch.Generator(device=device)
-g_xpu.manual_seed(25)
-torch.manual_seed(25)
 
 
-def Scatter_add(shape, dtype, dim, device):
+def Scatter_add(shape, dtype, dim, g_xpu, device):
     if dim == 2:
         m, n, k1, k2 = shape[0][0], shape[0][1], shape[0][2], shape[0][3]
         src = torch.ones((m, n, k1), dtype=dtype, device=device)
@@ -51,12 +45,41 @@ def Scatter_add(shape, dtype, dim, device):
         dst = zeros.scatter_add_(dim, index, src)
 
 
-if __name__ == "__main__":
+def run_profile(shape, dtype, dim, g_xpu, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Scatter_add(shape, dtype, dim, g_xpu, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, dim, g_xpu, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Scatter_add(shape, dtype, dim, g_xpu, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             dim = shape[1]
+            g_xpu = torch.Generator(device=args.device)
+            g_xpu.manual_seed(25)
+            torch.manual_seed(25)
 
             # warm up
-            Scatter_add(shape, dtype, dim, device)
+            Scatter_add(shape, dtype, dim, g_xpu, args.device)
 
             # go
             print(
@@ -69,20 +92,30 @@ def Scatter_add(shape, dtype, dim, device):
                 "; backward:",
                 backward,
             )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    Scatter_add(shape, dtype, dim, device)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                Scatter_add(shape, dtype, dim, device)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+            if not args.e2e_only:
+                run_profile(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, dim, g_xpu, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/softmax.py b/test/microbench/softmax.py
index 08d598e839..4927e1ce25 100644
--- a/test/microbench/softmax.py
+++ b/test/microbench/softmax.py
@@ -1,56 +1,94 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8192, 8192), (64, 8192), (8192, 64)]
 backward = True
-num_iter = 20
-shape_list = [(8192, 8192), (64, 8192), (8192, 64)]
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for dim in [0, 1]:
-            H, W = (int(shape[0]), int(shape[1]))
-            input = torch.randn((H, W)).to(dtype=dtype, device=device)
-
-            softmax = torch.nn.Softmax(dim=dim)
-            softmax.to(device=device, dtype=dtype)
-            grad_dpcpp = torch.randn((H, W)).to(device=device, dtype=dtype)
-            input.requires_grad_(True)
-
-            # warm up
-            output = softmax(input)
-            output.backward(grad_dpcpp)
-
-            # go
-            print(
-                "shape:",
-                (shape),
-                "; datatype:",
-                dtype,
-                "; dim:",
-                dim,
-                "; backward:",
-                backward,
-            )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    output = softmax(input)
-                    output.backward(grad_dpcpp)
-            print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=100))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                output = softmax(input)
-                output.backward(grad_dpcpp)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+def Softmax(input, softmax, grad_dpcpp, device):
+    output = softmax(input)
+    output.backward(grad_dpcpp)
+
+
+def run_profile(input, softmax, grad_dpcpp, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Softmax(input, softmax, grad_dpcpp, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, softmax, grad_dpcpp, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Softmax(input, softmax, grad_dpcpp, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            for dim in [0, 1]:
+                H, W = (int(shape[0]), int(shape[1]))
+                input = torch.randn((H, W)).to(dtype=dtype, device=args.device)
+
+                softmax = torch.nn.Softmax(dim=dim)
+                softmax.to(device=args.device, dtype=dtype)
+                grad_dpcpp = torch.randn((H, W)).to(device=args.device, dtype=dtype)
+                input.requires_grad_(True)
+
+                # warm up
+                Softmax(input, softmax, grad_dpcpp, args.device)
+
+                # go
+                print(
+                    "shape:",
+                    (shape),
+                    "; datatype:",
+                    dtype,
+                    "; dim:",
+                    dim,
+                    "; backward:",
+                    backward,
+                )
+                if not args.e2e_only:
+                    run_profile(input, softmax, grad_dpcpp, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, softmax, grad_dpcpp, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
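Each run_profile in these scripts picks the profiler activity that matches the target device and sorts the summary table by that device's total time. A standalone sketch of the same dispatch (illustrative; profile_op is not part of this patch, and with --device cpu only the CPU activity would be collected):

    import torch
    from torch.profiler import profile, ProfilerActivity

    def profile_op(fn, device, num_iter=20):
        # Always collect CPU events; add the matching GPU activity if any.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        with profile(activities=activities, record_shapes=True) as prof:
            for _ in range(num_iter):
                fn()
        print(prof.key_averages().table(sort_by=f"{device}_time_total"))
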
diff --git a/test/microbench/sort.py b/test/microbench/sort.py
index 4c09d627bb..f2fba81257 100644
--- a/test/microbench/sort.py
+++ b/test/microbench/sort.py
@@ -1,55 +1,90 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8193, 8193)]
 backward = False
-num_iter = 20
-shape_list = [(8193, 8193)]
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for dim in [None, 0, 1]:
-            input = torch.randn(shape, dtype=dtype, device=device)
-
-            # warm up
-            torch.sort(input)
-            torch.sort(input, 0)
-            torch.sort(input, 1)
-
-            # go
-            print(
-                "shape:",
-                (shape),
-                "; datatype:",
-                dtype,
-                "; dim:",
-                dim,
-                "; backward:",
-                backward,
-            )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    if dim is None:
-                        torch.sort(input)
-                    else:
-                        torch.sort(input, dim)
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                if dim is None:
-                    torch.sort(input)
-                else:
-                    torch.sort(input, dim)
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+def Sort(input, dim, device):
+    if dim is None:
+        torch.sort(input)
+    else:
+        torch.sort(input, dim)
+
+
+def run_profile(input, dim, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Sort(input, dim, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(input, dim, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Sort(input, dim, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            for dim in [None, 0, 1]:
+                input = torch.randn(shape, dtype=dtype, device=args.device)
+
+                # warm up
+                Sort(input, dim, args.device)
+
+                # go
+                print(
+                    "shape:",
+                    (shape),
+                    "; datatype:",
+                    dtype,
+                    "; dim:",
+                    dim,
+                    "; backward:",
+                    backward,
+                )
+                if not args.e2e_only:
+                    run_profile(input, dim, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(input, dim, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/sort.randperm.py b/test/microbench/sort.randperm.py
index 0e8ef1f830..62b986ee79 100644
--- a/test/microbench/sort.randperm.py
+++ b/test/microbench/sort.randperm.py
@@ -1,34 +1,75 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [(8193)]
 backward = False
-num_iter = 20
-shape_list = [(8193)]
-for shape in shape_list:
-    for dtype in [torch.float32]:
-        # warm up
-        torch.randperm(shape, dtype=dtype, device=device)
-
-        # go
-        print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
-        ) as prof:
-            for i in range(num_iter):
-                torch.randperm(shape, dtype=dtype, device=device)
-        print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-        # E2E time
-        torch.xpu.synchronize()
-        t1 = time.time()
+def Randperm(shape, dtype, device):
+    torch.randperm(shape, dtype=dtype, device=device)
+
+
+def run_profile(shape, dtype, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
         for i in range(num_iter):
-            torch.randperm(shape, dtype=dtype, device=device)
-        torch.xpu.synchronize()
-        t2 = time.time()
-        e2e_time = (t2 - t1) / num_iter
-        print("E2E total time:", f"{float(e2e_time):.20f}")
+            Randperm(shape, dtype, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Randperm(shape, dtype, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
+    for shape in shape_list:
+        for dtype in [torch.float32]:
+            # warm up
+            Randperm(shape, dtype, args.device)
+
+            # go
+            print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
+            if not args.e2e_only:
+                run_profile(shape, dtype, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(shape, dtype, args.device, args.num_iter)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_bicubic2d.py b/test/microbench/upsample_bicubic2d.py
index 4a985e6f05..557684cd03 100644
--- a/test/microbench/upsample_bicubic2d.py
+++ b/test/microbench/upsample_bicubic2d.py
@@ -1,85 +1,123 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [
+    [1, 3, 1200, 1200],
+    [1, 128, 1200, 1200],
+    [1, 3, 1200, 1200],
+    [128, 128, 5, 5],
+]
+scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7]]
 backward = True
-num_iter = 20
 
-cache_r = torch.randn((1024 * 1024 * 1024), device="xpu")
-cache_w = torch.randn((1024 * 1024 * 1024), device="xpu")
 
-def simple_test(in_shape, scale_factor, backward, dtype):
-    in_tensor = torch.randn(
-        in_shape, dtype=dtype, device=device, requires_grad=backward
-    )
+def Bicubic2d(in_tensor, scale, backward, device):
     output = torch.nn.functional.interpolate(
-        in_tensor, mode="bicubic", scale_factor=scale_factor, align_corners=True
+        in_tensor,
+        mode="bicubic",
+        scale_factor=scale,
+        align_corners=True,
     )
-
-    # warm_up
-    for _ in range(10):
-        output = torch.nn.functional.interpolate(
-            in_tensor, mode="bicubic", scale_factor=scale_factor, align_corners=True
+    if backward:
+        output = torch.autograd.grad(
+            output, in_tensor, grad_outputs=torch.ones_like(output)
         )
 
-    # go
-    print(
-        "shape:",
-        (in_shape),
-        "; datatype:",
-        dtype,
-        "; scale_factor:",
-        scale_factor,
-        "; backward:",
-        backward,
-    )
+
+def run_profile(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
     ) as prof:
-        for i in range(num_iter):
+        for _ in range(num_iter):
             cache_r = cache_w + 1
-            output = torch.nn.functional.interpolate(
-                in_tensor,
-                mode="bicubic",
-                scale_factor=scale_factor,
-                align_corners=True,
-            )
-            if backward:
-                output = torch.autograd.grad(
-                    output, in_tensor, grad_outputs=torch.ones_like(output)
-                )
-    print(prof.key_averages().table(sort_by="xpu_time_total"))
+            Bicubic2d(in_tensor, scale, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
 
-    # E2E time
-    torch.xpu.synchronize()
+def run_e2e(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
-    for i in range(num_iter):
+    for _ in range(num_iter):
         cache_r = cache_w + 1
-        output = torch.nn.functional.interpolate(
-            in_tensor,
-            mode="bicubic",
-            scale_factor=scale_factor,
-            align_corners=True,
-        )
-        if backward:
-            output = torch.autograd.grad(
-                output, in_tensor, grad_outputs=torch.ones_like(output)
-            )
-    torch.xpu.synchronize()
+        Bicubic2d(in_tensor, scale, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")
 
 
-shape_list = [
-    [1, 3, 1200, 1200],
-    [1, 128, 1200, 1200],
-    [1, 3, 1200, 1200],
-    [128, 128, 5, 5],
-]
-scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7]]
-for sp, sf in zip(shape_list, scale_factor):
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        simple_test(sp, sf, backward, dtype)
+def benchmark(args):
+    for in_shape, scale in zip(shape_list, scale_factor):
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            in_tensor = torch.randn(
+                in_shape, dtype=dtype, device=args.device, requires_grad=backward
+            )
+            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device)
+            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device)
+
+            # warm_up
+            Bicubic2d(in_tensor, scale, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                (in_shape),
+                "; datatype:",
+                dtype,
+                "; scale_factor:",
+                scale,
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_bilinear2d.py b/test/microbench/upsample_bilinear2d.py
index fc841edd27..48e18e75d0 100644
--- a/test/microbench/upsample_bilinear2d.py
+++ b/test/microbench/upsample_bilinear2d.py
@@ -1,87 +1,124 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
+shape_list = [
+    [1, 3, 1200, 1200],
+    [1, 128, 1200, 1200],
+    [1, 3, 1200, 1200],
+    [128, 128, 5, 5],
+    [8, 32, 256, 256],
+]
+scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7], 3]
 backward = True
-num_iter = 20
 
-cache_r = torch.randn((1024 * 1024 * 1024), device="xpu")
-cache_w = torch.randn((1024 * 1024 * 1024), device="xpu")
 
-def simple_test(in_shape, scale_factor, backward, dtype, mode):
-    in_tensor = torch.randn(
-        in_shape, dtype=dtype, device=device, requires_grad=backward
-    )
+def Bilinear2d(in_tensor, scale, backward, device):
     output = torch.nn.functional.interpolate(
-        in_tensor, mode=mode, scale_factor=scale_factor
+        in_tensor,
+        mode="bilinear",
+        scale_factor=scale,
     )
-
-    # warm_up
-    for _ in range(10):
-        output = torch.nn.functional.interpolate(
-            in_tensor, mode=mode, scale_factor=scale_factor
+    if backward:
+        output = torch.autograd.grad(
+            output, in_tensor, grad_outputs=torch.ones_like(output)
         )
 
-    # go
-    print(
-        "shape:",
-        (in_shape),
-        "; datatype:",
-        dtype,
-        "; scale_factor:",
-        scale_factor,
-        "; mode:",
-        mode,
-        "; backward:",
-        backward,
-    )
+
+def run_profile(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
    ) as prof:
-        for i in range(num_iter):
+        for _ in range(num_iter):
             cache_r = cache_w + 1
-            output = torch.nn.functional.interpolate(
-                in_tensor,
-                mode=mode,
-                scale_factor=scale_factor,
-            )
-            if backward:
-                output = torch.autograd.grad(
-                    output, in_tensor, grad_outputs=torch.ones_like(output)
-                )
-    print(prof.key_averages().table(sort_by="xpu_time_total"))
+            Bilinear2d(in_tensor, scale, backward, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
 
-    # E2E time
-    torch.xpu.synchronize()
+def run_e2e(in_tensor, scale, backward, cache_r, cache_w, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
-    for i in range(num_iter):
+    for _ in range(num_iter):
         cache_r = cache_w + 1
-        output = torch.nn.functional.interpolate(
-            in_tensor,
-            mode=mode,
-            scale_factor=scale_factor,
-        )
-        if backward:
-            output = torch.autograd.grad(
-                output, in_tensor, grad_outputs=torch.ones_like(output)
-            )
-    torch.xpu.synchronize()
+        Bilinear2d(in_tensor, scale, backward, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")
 
 
-shape_list = [
-    [1, 3, 1200, 1200],
-    [1, 128, 1200, 1200],
-    [1, 3, 1200, 1200],
-    [128, 128, 5, 5],
-    [8, 32, 256, 256],
-]
-scale_factor = [[3, 3], [3, 3], [7, 7], [7, 7], 3]
-for sp, sf in zip(shape_list, scale_factor):
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        for mode in ["bilinear"]:
-            simple_test(sp, sf, backward, dtype, mode)
+def benchmark(args):
+    for in_shape, scale in zip(shape_list, scale_factor):
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            in_tensor = torch.randn(
+                in_shape, dtype=dtype, device=args.device, requires_grad=backward
+            )
+            cache_r = torch.randn((1024 * 1024 * 1024), device=args.device)
+            cache_w = torch.randn((1024 * 1024 * 1024), device=args.device)
+
+            # warm_up
+            Bilinear2d(in_tensor, scale, backward, args.device)
+
+            # go
+            print(
+                "shape:",
+                (in_shape),
+                "; datatype:",
+                dtype,
+                "; scale_factor:",
+                scale,
+                "; mode:",
+                "bilinear",
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+            if not args.profile_only:
+                run_e2e(
+                    in_tensor,
+                    scale,
+                    backward,
+                    cache_r,
+                    cache_w,
+                    args.device,
+                    args.num_iter,
+                )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
diff --git a/test/microbench/upsample_nearest2d.py b/test/microbench/upsample_nearest2d.py
index 1e0a74aa4a..610bb3cac8 100644
--- a/test/microbench/upsample_nearest2d.py
+++ b/test/microbench/upsample_nearest2d.py
@@ -1,20 +1,19 @@
+import argparse
 import time
 
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-num_iter = 20
-
 shape_list = [
     (8, 32, 256, 256, (3)),
     (8, 512, 16, 16, (1.5)),
     (16, 1024, 23, 23, (2.3)),
     (4, 32, 80, 128, (2)),
 ]
+backward = True
 
 
-def Interpolate2d(shape, dtype, channels_last, backward, mode):
+def Interpolate2d(shape, dtype, channels_last, backward, mode, device):
     N, C, H, W, scale_factor = shape[0], shape[1], shape[2], shape[3], shape[4]
 
     if channels_last:
@@ -34,14 +33,41 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode):
         output.backward(torch.ones_like(output))
 
 
-if __name__ == "__main__":
-    backward = True
+def run_profile(shape, dtype, channels_last, backward, mode, device, num_iter):
+    with profile(
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
+        record_shapes=True,
+    ) as prof:
+        for i in range(num_iter):
+            Interpolate2d(shape, dtype, channels_last, backward, mode, device)
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+
+
+def run_e2e(shape, dtype, channels_last, backward, mode, device, num_iter):
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t1 = time.time()
+    for i in range(num_iter):
+        Interpolate2d(shape, dtype, channels_last, backward, mode, device)
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
+
+
+def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for channels_last in [False, True]:
                 for mode in ["nearest"]:
                     # warm up
-                    Interpolate2d(shape, dtype, channels_last, backward, mode)
+                    Interpolate2d(
+                        shape, dtype, channels_last, backward, mode, args.device
+                    )
 
                     # go
                     print(
@@ -58,20 +84,46 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode):
                         "; backward:",
                         backward,
                     )
-                    with profile(
-                        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                        record_shapes=True,
-                    ) as prof:
-                        for i in range(num_iter):
-                            Interpolate2d(shape, dtype, channels_last, backward, mode)
-                    print(prof.key_averages().table(sort_by="xpu_time_total"))
-
-                    # E2E time
-                    torch.xpu.synchronize()
-                    t1 = time.time()
-                    for i in 
range(num_iter): - Interpolate2d(shape, dtype, channels_last, backward, mode) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/upsample_nearest3d.py b/test/microbench/upsample_nearest3d.py index 841b404296..cab96e0173 100644 --- a/test/microbench/upsample_nearest3d.py +++ b/test/microbench/upsample_nearest3d.py @@ -1,19 +1,18 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 - shape_list = [ (8, 32, 256, 256, 2, (3)), (8, 512, 16, 16, 4, (1.5)), (16, 1024, 23, 23, 7, (2.3)), ] +backward = True -def Interpolate3d(shape, dtype, channels_last, backward): +def Interpolate3d(shape, dtype, channels_last, backward, device): N, C, H, W, D, scale_factor = ( shape[0], shape[1], @@ -42,13 +41,38 @@ def Interpolate3d(shape, dtype, channels_last, backward): output.backward(torch.ones_like(output)) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, channels_last, backward, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Interpolate3d(shape, dtype, channels_last, backward, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Interpolate3d(shape, dtype, channels_last, backward, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: # warm up - Interpolate3d(shape, dtype, channels_last, backward=True) + Interpolate3d(shape, dtype, channels_last, backward, args.device) # go print( @@ -63,20 +87,44 @@ def Interpolate3d(shape, dtype, channels_last, backward): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - Interpolate3d(shape, dtype, channels_last, backward=True) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - 
for i in range(num_iter): - Interpolate3d(shape, dtype, channels_last, backward=True) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args) diff --git a/test/microbench/upsample_nearest_exact2d.py b/test/microbench/upsample_nearest_exact2d.py index 3eb043023e..0ade8e670a 100644 --- a/test/microbench/upsample_nearest_exact2d.py +++ b/test/microbench/upsample_nearest_exact2d.py @@ -1,20 +1,19 @@ +import argparse import time import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" -num_iter = 20 - shape_list = [ (8, 32, 256, 256, (3)), (8, 512, 16, 16, (1.5)), (16, 1024, 23, 23, (2.3)), (4, 32, 80, 128, (2)), ] +backward = True -def Interpolate2d(shape, dtype, channels_last, backward, mode): +def Interpolate2d(shape, dtype, channels_last, backward, mode, device): N, C, H, W, scale_factor = shape[0], shape[1], shape[2], shape[3], shape[4] if channels_last: @@ -34,14 +33,41 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode): output.backward(torch.ones_like(output)) -if __name__ == "__main__": - backward = True +def run_profile(shape, dtype, channels_last, backward, mode, device, num_iter): + with profile( + activities=[ + ProfilerActivity.CPU, + ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + for i in range(num_iter): + Interpolate2d(shape, dtype, channels_last, backward, mode, device) + print(prof.key_averages().table(sort_by=f"{device}_time_total")) + + +def run_e2e(shape, dtype, channels_last, backward, mode, device, num_iter): + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t1 = time.time() + for i in range(num_iter): + Interpolate2d(shape, dtype, channels_last, backward, mode, device) + if device in ["xpu", "cuda"]: + torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize() + t2 = time.time() + e2e_time = (t2 - t1) / num_iter + print("E2E total time:", f"{float(e2e_time):.20f}") + + +def benchmark(args): for shape in shape_list: for dtype in [torch.bfloat16, torch.float16, torch.float32]: for channels_last in [False, True]: for mode in ["nearest-exact"]: # warm up - Interpolate2d(shape, dtype, channels_last, backward, mode) + Interpolate2d( + shape, dtype, channels_last, backward, mode, args.device + ) # go print( @@ -58,20 +84,46 @@ def Interpolate2d(shape, dtype, channels_last, backward, mode): "; backward:", backward, ) - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], - record_shapes=True, - ) as prof: - for i in range(num_iter): - 
Interpolate2d(shape, dtype, channels_last, backward, mode) - print(prof.key_averages().table(sort_by="xpu_time_total")) - - # E2E time - torch.xpu.synchronize() - t1 = time.time() - for i in range(num_iter): - Interpolate2d(shape, dtype, channels_last, backward, mode) - torch.xpu.synchronize() - t2 = time.time() - e2e_time = (t2 - t1) / num_iter - print("E2E total time:", f"{float(e2e_time):.20f}") + if not args.e2e_only: + run_profile( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + if not args.profile_only: + run_e2e( + shape, + dtype, + channels_last, + backward, + mode, + args.device, + args.num_iter, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="OP Benchmark") + parser.add_argument( + "--device", + type=str, + default="xpu", + help='Device to run on (e.g., "cpu", "cuda", "xpu")', + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--profile-only", action="store_true", help="Only Run profile timing" + ) + group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing") + parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + benchmark(args)
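Note on the cache_r and cache_w buffers: every timed loop starts with cache_r = cache_w + 1, an elementwise add across the two 1024 * 1024 * 1024 element float32 tensors that benchmark() allocates. Judging from the scripts, the intent is to stream roughly 8 GB of traffic (a 4 GiB read plus a 4 GiB write) through device memory between iterations so each measurement starts from a cold cache; the result of the add is overwritten on the next iteration and never read.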
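Each script also repeats the same per-device branching for synchronization and profiler activities, including a conditional expression used as a bare statement (torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()). A minimal sketch of how this could be factored into shared helpers follows; the names sync_device and profiler_activities are illustrative, not part of the patch, and torch.xpu is assumed to be present only in XPU-enabled PyTorch builds:

    import torch
    from torch.profiler import ProfilerActivity

    def sync_device(device):
        # Block until all queued kernels on the accelerator have finished.
        # CPU ops complete eagerly, so no synchronization is needed there.
        if device == "xpu":
            torch.xpu.synchronize()
        elif device == "cuda":
            torch.cuda.synchronize()

    def profiler_activities(device):
        # Always collect CPU activity; add an accelerator activity only
        # when one is actually in use.
        activities = [ProfilerActivity.CPU]
        if device == "xpu":
            activities.append(ProfilerActivity.XPU)
        elif device == "cuda":
            activities.append(ProfilerActivity.CUDA)
        return activities

run_e2e could then open and close with sync_device(device), and run_profile with profile(activities=profiler_activities(device), record_shapes=True). Selecting activities this way would also avoid requesting ProfilerActivity.CUDA when a script is run with --device cpu, which the expression in the patches as written would do.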
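With the shared parse_args() added to every script, typical invocations look like the following (paths as in the diffs above):

    python test/microbench/upsample_bilinear2d.py                       # profile + E2E on xpu, 20 iterations
    python test/microbench/upsample_nearest2d.py --device cuda --e2e-only
    python test/microbench/upsample_nearest3d.py --profile-only --num-iter 50

--profile-only and --e2e-only are mutually exclusive; omitting both runs the profiler pass followed by the E2E timing pass, on the default device "xpu" with the default of 20 iterations.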