
Commit 0760724

[CI] Add llama case for profile test (#1716)
Add a llama inference test that checks operator call counts when profiling
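
The new check runs a Llama-3 inference under torch.profiler (test/profiling/llama.py), aggregates the call counts of non-aten operations into a CSV (.github/scripts/llama_summary.py), and compares that summary against a checked-in baseline (.github/scripts/check_baseline.sh with .github/scripts/llama_baseline.csv).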
1 parent b1757dd commit 0760724

File tree (5 files changed: +217 -13 lines)

.github/scripts/check_baseline.sh
.github/scripts/llama_baseline.csv
.github/scripts/llama_summary.py
.github/workflows/_linux_ut.yml
test/profiling/llama.py
.github/scripts/check_baseline.sh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
#!/bin/bash
file1="$1"
file2="$2"

if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
    echo "Error: One or both files do not exist" >&2
    exit 1
fi

if ! diff_output=$(diff <(sort "$file1") <(sort "$file2")); then
    echo "ERROR: Files $file1 and $file2 differ!" >&2
    echo "Differences found:" >&2
    echo "$diff_output" >&2
    # Exit non-zero so CI flags the mismatch instead of falling through to SUCCESS.
    exit 1
fi

echo "SUCCESS: Files $file1 and $file2 are the same"
exit 0
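
In the workflow below, the script is invoked as bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv, comparing the checked-in baseline against the freshly generated summary. Sorting both files before diffing makes the check insensitive to row order.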

.github/scripts/llama_baseline.csv

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
Operation Name,Section 1 Calls,Section 2 Calls,Section 3 Calls,Section 4 Calls,Section 5 Calls,Total Calls
urEnqueueKernelLaunch,1590,1590,1590,1590,1590,7950
"at::native::xpu::VectorizedElementwiseKernel<8, at::...",449,449,449,449,449,2245
at::native::xpu::ElementwiseGlobalRangeKernel<at::na...,390,390,390,390,390,1950
gemm_kernel,226,226,226,226,226,1130
"at::native::xpu::VectorizedElementwiseKernel<4, at::...",210,210,210,210,210,1050
urUSMDeviceAlloc,164,160,160,160,160,804
Memset (DEVICE),160,160,160,160,160,800
urEnqueueUSMFill,160,160,160,160,160,800
at::native::xpu::UnrolledElementwiseKernel<at::nativ...,138,138,138,138,138,690
"at::native::xpu::ReduceKernel<1, at::native::xpu::Re...",74,74,74,74,74,370
micro_sdpa,32,32,32,32,32,160
"at::native::xpu::VectorizedElementwiseKernel<2, at::...",16,16,16,16,16,80
"at::native::xpu::VectorizedElementwiseKernel<16, at:...",14,14,14,14,14,70
urEnqueueUSMMemcpy,10,10,10,10,10,50
at::native::xpu::SegmentedRadixSortPairsDownsweepFun...,8,8,8,8,8,40
at::native::xpu::SegmentedRadixSortPairsUpsweepFunct...,8,8,8,8,8,40
at::native::xpu::SegmentedRadixSortPairsScanFunctor<...,8,8,8,8,8,40
Memcpy D2M (DEVICE -> MEMORY(Unknown)),6,6,6,6,6,30
at::native::xpu::SegmentScanKernel<at::native::xpu::...,4,4,4,4,4,20
at::native::xpu::impl::SoftmaxForwardKernelFunctor<4...,2,2,2,2,2,10
Memcpy H2D (HOST -> DEVICE),2,2,2,2,2,10
at::native::xpu::CatArrayBatchedCopyKernelFunctor<lo...,2,2,2,2,2,10
at::native::xpu::AssertAsyncKernelFunctor1<bool>,2,2,2,2,2,10
Memcpy M2D (MEMORY(Unknown) -> DEVICE),2,2,2,2,2,10
detach_,2,2,2,2,2,10
at::native::xpu::SegmentedGroupRadixSelectPairsFunct...,1,1,1,1,1,5
at::native::xpu::SegmentedGroupRadixSortPairsFunctor...,1,1,1,1,1,5
at::native::xpu::ScatterGatherElementwiseKernelFunct...,1,1,1,1,1,5
"at::native::xpu::ReduceKernel<2, at::native::xpu::Re...",1,1,1,1,1,5
at::native::xpu::IndexKernel<at::native::xpu::IndexK...,1,1,1,1,1,5
at::native::xpu::DistributionElementwiseKernelFuncto...,1,1,1,1,1,5
at::native::xpu::AccumulateCarrierKernelFunctor<at::...,1,1,1,1,1,5
urUSMHostAlloc,1,0,0,0,0,1
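
Each "Section N Calls" column holds the counts from the Nth profiled iteration of llama.py, and "Total Calls" is their sum, matching the header row written by llama_summary.py. The first iteration allocates slightly more (urUSMDeviceAlloc 164 vs. 160, urUSMHostAlloc 1 vs. 0), presumably one-time warm-up allocations.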

.github/scripts/llama_summary.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import re
import csv
import argparse
from collections import defaultdict

def parse_log_sections(log_content):
    sections = []
    # One section per "datatype: torch.float16 ; i: N" header printed by llama.py
    pattern = re.compile(r'^(datatype: torch\.float16 ; i: \d+)(.*?)(?=^datatype: |\Z)', re.MULTILINE | re.DOTALL)

    for match in pattern.finditer(log_content):
        header = match.group(1).strip()
        content = match.group(2).strip()
        if content:
            sections.append((header, content))

    return sections

def extract_non_aten_data(section_content):
    # One profiler-table row: the operation name, nine percentage/time
    # columns, and the trailing call count (group 11).
    pattern = re.compile(
        r'^\s*([^\s].*?)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+'
        r'(\d+\.\d+\w*s)\s+(\d+\.\d+\w*s)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+'
        r'(\d+\.\d+\w*s)\s+(\d+)',
        re.MULTILINE
    )

    section_data = []
    for match in pattern.finditer(section_content):
        name = match.group(1).strip()
        if not name.startswith('aten::'):
            num_calls = int(match.group(11))
            section_data.append((name, num_calls))

    return section_data

def process_log_file(input_file):
    with open(input_file) as f:
        log_content = f.read()

    sections = parse_log_sections(log_content)
    all_data = defaultdict(dict)
    section_headers = []
    duplicate_names = defaultdict(list)

    print("\nSections found in the test log:")
    for i, (header, content) in enumerate(sections):
        print(f"[part {i+1}] {header}")
        section_headers.append(header)
        section_data = extract_non_aten_data(content)

        # Track duplicate names within the same section
        seen_in_section = defaultdict(int)
        for name, num_calls in section_data:
            seen_in_section[name] += 1
            if seen_in_section[name] > 1:
                duplicate_names[name].append((i, num_calls))

        for name, num_calls in section_data:
            all_data[name][i] = all_data[name].get(i, 0) + num_calls

    # Print duplicate names and their calls
    if duplicate_names:
        print("\nDuplicate names found:")
        for name, calls in duplicate_names.items():
            print(f"Name: {name}")
            for section_idx, num_calls in calls:
                print(f"  Section {section_idx+1}: {num_calls} calls")
    else:
        print("\nNo duplicate names found.")

    return all_data, section_headers

def write_to_csv(data, section_headers, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        headers = ['Operation Name'] + [f"Section {i+1} Calls" for i in range(len(section_headers))] + ['Total Calls']
        writer.writerow(headers)
        # Rows sorted by total call count, descending
        for name, calls_data in sorted(data.items(), key=lambda x: sum(x[1].values()), reverse=True):
            row = [name]
            total = 0
            for i in range(len(section_headers)):
                calls = calls_data.get(i, 0)
                row.append(str(calls))
                total += calls
            row.append(str(total))
            writer.writerow(row)

    print(f"\nGenerated result CSV file: {output_file}")

def main():
    parser = argparse.ArgumentParser(description='obtain the call counts of non-aten ops')
    parser.add_argument('-i', '--input', required=True, help='input log path')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='output summary file')

    args = parser.parse_args()

    try:
        print(f"\nProcessing the log file: {args.input}")
        csv_data, section_headers = process_log_file(args.input)

        if csv_data:
            write_to_csv(csv_data, section_headers, args.output)
            print("\nSummary of non-aten ops:")
            for name, calls in csv_data.items():
                print(f"{name}: {calls}")
        else:
            print("Warning: no non-aten ops found")
    except FileNotFoundError:
        print(f"Error: Input file {args.input} not found")
    except Exception as e:
        print(f"Error when processing the input file: {str(e)}")

if __name__ == "__main__":
    main()
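
For reference, a minimal sketch of the log shape this parser expects; the rows below are synthetic, and the 11-column layout (operation name, nine percentage/time columns, then the call count) is assumed from the table that test/profiling/llama.py prints via prof.key_averages().table():

# Hypothetical snippet: one section header from llama.py, then two
# synthetic profiler-table rows in the 11-column layout the regex expects.
sample_log = (
    "datatype: torch.float16 ; i: 0\n"
    "aten::matmul  0.10%  0.100ms  0.20%  0.200ms  0.100ms"
    "  0.000us  0.00%  0.000us  0.000us  32\n"
    "gemm_kernel  0.00%  0.000us  0.00%  0.000us  0.000us"
    "  1.234ms  5.67%  1.234ms  38.56us  32\n"
)

for header, content in parse_log_sections(sample_log):
    # aten::* rows are filtered out, so only the backend kernel remains
    print(header, extract_non_aten_data(content))
# -> datatype: torch.float16 ; i: 0 [('gemm_kernel', 32)]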

.github/workflows/_linux_ut.yml

Lines changed: 20 additions & 13 deletions
@@ -147,36 +147,43 @@ jobs:
         pip install pytest pytest-timeout
     - name: 'xpu_profiling'
       condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }}
+      directory: '$GITHUB_WORKSPACE'
       command_script: |
+        mkdir -p $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce
         # RN50 Test
         PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0
-        cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test
+        cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/xpu_profiling
 
         # All Issue Reproduce UT
         python -u test/profiling/correlation_id_mixed.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log
         python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log
         python -u test/profiling/time_precision_in_profile.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log
         python -u test/profiling/profile_partial_runtime_ops.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log
         python -u test/profiling/triton_xpu_ops_time.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log
-
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log
+
+        # llama case for calls number test
+        python test/profiling/llama.py | \
+        tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log
+        python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv
+        bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv
+
         # All xpu ut under test/profiler
         cd ../pytorch/test/profiler
         python -m pytest --timeout 600 -vs test_cpp_thread.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_cpp_thread.log
         python -m pytest --timeout 600 -vs test_execution_trace.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_execution_trace.log
         python -m pytest --timeout 600 -vs test_memory_profiler.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_memory_profiler.log
         python -m pytest --timeout 600 -vs test_profiler_tree.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_profiler_tree.log
       additional_steps: |
-        pip install pytest pytest-timeout
-        mkdir -p ut_log/profile_test/issue_reproduce
+        pip install pytest pytest-timeout transformers
     outputs:
       ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }}
     steps:
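
Note that additional_steps now installs transformers alongside pytest and pytest-timeout, since test/profiling/llama.py imports AutoModelForCausalLM and AutoTokenizer from it, and the mkdir for the log directory moves into command_script, targeting the renamed ut_log/xpu_profiling path.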

test/profiling/llama.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model.eval().to("xpu")

prompt = "If Alice is older than Bob, and Bob is older than Charlie, who is the youngest? Explain your reasoning."
inputs = tokenizer(prompt, return_tensors="pt").to("xpu")

with torch.no_grad():
    for i in range(5):
        print(
            "datatype:",
            torch.float16,
            "; i:",
            i,
        )
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.XPU,
            ]
        ) as prof:
            outputs = model.generate(**inputs, max_new_tokens=1)
        print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=-1))
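
Each of the five iterations prints a "datatype: torch.float16 ; i: N" header followed by the profiler table from key_averages().table(), which is exactly the shape llama_summary.py parses. Generating a single token per iteration (max_new_tokens=1) keeps the per-iteration operator call counts small and, presumably, stable enough to compare against the baseline.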
