diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml new file mode 100644 index 0000000000..c55ca37cc6 --- /dev/null +++ b/.github/actions/get-runner/action.yml @@ -0,0 +1,47 @@ +name: Get Runner Infos + +outputs: + runner_id: + value: ${{ steps.runner.outputs.runner_id }} + user_id: + value: ${{ steps.runner.outputs.user_id }} + render_id: + value: ${{ steps.runner.outputs.render_id }} + hostname: + value: ${{ steps.runner.outputs.hostname }} + +permissions: read-all + +runs: + using: composite + steps: + - name: Get runner + shell: bash -xe {0} + id: runner + run: | + # get test runner + echo "runner_id=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} + echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + # show host info + lscpu + lshw -C display + free -h + df -h + cat /etc/os-release + uname -a + - name: Cleanup host + shell: bash -xe {0} + run: | + # clean docker cache + docker system prune -af || true + # clean workspace + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + cd ${RUNNER_WORKSPACE}/.. + if [ "${PWD}" != "/" ];then + ls -al + sudo chmod 777 -R torch-xpu-ops _temp _actions _tool || true + sudo rm -rf _temp + fi diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml deleted file mode 100644 index 1631f399f2..0000000000 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ /dev/null @@ -1,185 +0,0 @@ -name: inductor-xpu-e2e-test - -inputs: - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. 
Delimiter is comma - env_prepare: - required: false - description: If set to any value, will prepare suite test env - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: inference,training. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: accuracy,performance. Delimiter is comma - cards: - required: false - type: string - default: 'all' - description: which cards can be used in the test - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling - -runs: - using: composite - steps: - - name: Prepare ENV - if: ${{ inputs.env_prepare }} - shell: bash - run: | - source activate e2e_ci - if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git - cd audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi - cd ../ && python -c "import torch, torchvision, torchaudio" - rm -rf benchmark && git clone https://github.com/pytorch/benchmark.git - cd benchmark && git checkout $TORCHBENCH_COMMIT_ID - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps 
git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then - pip install -U transformers==4.44.2 - fi - if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi - # install timm without dependencies - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - # install timm dependencies without torch and torchvision - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - fi - pip install numpy==1.26.4 - - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - shell: bash - run: | - source activate e2e_ci - cp .github/scripts/inductor_xpu_test.sh ../pytorch - cd ../pytorch - - # check param - function contains() { - contains_status="echo 'Start $2 
...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) - cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" - export OMP_NUM_THREADS=${cores_per_instance} - for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') - do - if [ "${suite}" == "pt2e" ];then - continue - fi - contains "huggingface,timm_models,torchbench" $suite - $contains_status - for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') - do - contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt - $contains_status - for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') - do - contains "inference,training" $mode - $contains_status - for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') - do - contains "accuracy,performance" $scenario - $contains_status - if [ "${MODEL_ONLY_NAME}" == "" ];then - for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) - do - cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" - numactl --localalloc --physcpubind=${cpu_list} bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & - done - else - for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') - do - numactl --localalloc bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} - done - fi - wait - # summarize pass rate - LOG_DIR="inductor_log/${suite}/${dt}" - LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log - rm -f ${LOG_DIR}/${LOG_NAME} - find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 - done - done - done - done - - - name: Summary E2E Test (${{ inputs.suite }} ${{ 
inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - shell: bash - run: | - cd ../pytorch - rm -f inductor_log/summary_accuracy.csv - for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") - do - sed -i "s/$/,$(basename $var)/" $var - cat $var >> inductor_log/summary_accuracy.csv - done - - source activate e2e_ci - cd ${{ github.workspace }} - cp .github/scripts/inductor_summary.py ../pytorch - cd ../pytorch - pip install styleFrame scipy pandas - set -xe - dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') - mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') - suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') - scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') - python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml new file mode 100644 index 0000000000..52ef8a4cc9 --- /dev/null +++ b/.github/actions/linux-e2etest/action.yml @@ -0,0 +1,111 @@ +name: Linux E2E Test + +inputs: + env_prepare: + required: false + description: If set to any value, will prepare suite test env + suite: + required: true + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma + dt: + required: true + type: string + default: 'float32' + description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma + mode: + required: true + type: string + default: 'inference' + description: inference,training. Delimiter is comma + scenario: + required: true + type: string + default: 'accuracy' + description: accuracy,performance. 
Delimiter is comma + +runs: + using: composite + steps: + - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -x {0} + run: | + pip list |grep -E 'intel|torch' + cp ./.github/scripts/inductor_xpu_test.sh ./pytorch + cd ./pytorch + # check param + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not supported type! Skipped!" + contains_status="continue" + } + } + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" + export OMP_NUM_THREADS=${cores_per_instance} + for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') + do + if [ "${suite}" == "pt2e" ];then + continue + fi + contains "huggingface,timm_models,torchbench" $suite + $contains_status + for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') + do + contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt + $contains_status + for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') + do + contains "inference,training" $mode + $contains_status + for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') + do + contains "accuracy,performance" $scenario + $contains_status + if [ "${MODEL_ONLY_NAME}" == "" ];then + for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) + do + cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" + numactl --localalloc --physcpubind=${cpu_list} bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & + done + else + for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') + do + numactl --localalloc bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} + done + fi + wait + # summarize pass 
rate + LOG_DIR="inductor_log/${suite}/${dt}" + LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log + rm -f ${LOG_DIR}/${LOG_NAME} + find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 + done + done + done + done + + - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} + run: | + cd ./pytorch + rm -f inductor_log/summary_accuracy.csv + for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") + do + sed -i "s/$/,$(basename $var)/" $var + cat $var >> inductor_log/summary_accuracy.csv + done + cp ${{ github.workspace }}/.github/scripts/inductor_summary.py ./ + csv_file="$(find inductor_log/ -name "inductor_*_xpu_*.csv" |tail -n 1)" + if [ -f "${csv_file}" ];then + pip install styleFrame scipy pandas + dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') + mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') + suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') + scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') + python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} + fi diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml new file mode 100644 index 0000000000..188dacc29b --- /dev/null +++ b/.github/actions/linux-testenv/action.yml @@ -0,0 +1,168 @@ +name: Setup Test Environment + +inputs: + test_type: + required: true + type: string + description: Test scope + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: + type: string + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: 
Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + +permissions: read-all + +runs: + using: composite + steps: + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + shell: bash -xe {0} + run: | + hostname && id + cat /etc/os-release + gcc -v && g++ -v + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + clinfo --list + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + rm -rf ~/.triton /tmp/*inductor* + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + shell: bash -xe {0} + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + source ${HOME}/intel/oneapi/setvars.sh + sycl-ls && icpx -v + - name: Download Pytorch wheel + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Install E2E Requirements + if: ${{ contains(inputs.test_type, 'e2e') }} + shell: bash -xe {0} + run: | + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu + pip install pandas psutil scipy + if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + pip install transformers==4.44.2 + elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then + pip install timm==1.0.14 + elif [[ "${{ inputs.suite }}" == *"torchbench"* ]];then + rm -rf ./benchmark + git clone https://github.com/pytorch/benchmark + cd benchmark + git checkout e03a63be43e33596f7f0a43b0f530353785e4a59 + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + elif [[ "${{ inputs.suite }}" == *"pt2e"* ]];then + rm -rf ./benchmark + git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark + cd benchmark + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + fi + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton + - name: Prepare Stock Pytorch + shell: bash -xe {0} + run: | + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url 
https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + # apply extra PRs for stock pytorch + pip install requests + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + fi + git status && git diff && git show -s + - name: Prepare Torch-xpu-ops + shell: bash -xe {0} + if: ${{ inputs.torch_xpu_ops != 'skipped' }} + run: | + cd pytorch + rm -rf third_party/torch-xpu-ops + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + else + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then + TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" + else + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" + fi + fi + if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ] || [ "${{ inputs.torch_xpu_ops }}" == "triggered" ];then + cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + else + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} + fi + git status 
&& git diff && git show -s + - name: Torch Config + shell: bash -xe {0} + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml new file mode 100644 index 0000000000..d522dc4691 --- /dev/null +++ b/.github/actions/linux-uttest/action.yml @@ -0,0 +1,176 @@ +name: Linux Unit Test + +inputs: + test_type: + required: true + type: string + description: Test scope + +permissions: read-all + +runs: + using: composite + steps: + - name: requirements + shell: bash -xe {0} + run: | + pip install -r pytorch/.ci/docker/requirements-ci.txt + pip install -U pytest-timeout pytest-xdist + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + parallel_options="" + if [ ${xpu_num} -gt 1 ];then + parallel_options+=" --dist worksteal " + for x in $(seq 0 $[ ${xpu_num} - 1 ]) + do + parallel_options+=" --tx popen//env:ZE_AFFINITY_MASK=${x} " + done + else + parallel_options+=" -n 1 " + fi + printf " --timeout 600 --timeout_method=thread ${parallel_options} " > ${{ github.workspace }}/test-options.txt + - name: ut_regression + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_regression' }} + run: | + mkdir -p ut_log/ut_regression + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest $(cat ${{ github.workspace }}/test-options.txt) -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + 2> ${{ github.workspace 
}}/ut_log/ut_regression/ut_regression_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log + - name: ut_transformers + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_transformers' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_transformers + cd pytorch + pytest $(cat ${{ github.workspace }}/test-options.txt) -v test/test_transformers.py -k xpu \ + --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ + 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log + - name: ut_extended + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_extended' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_extended + cd pytorch/third_party/torch-xpu-ops/test/xpu/extended + python run_test_with_skip.py \ + 2> ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + - name: ut_op + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_op' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_ENABLE_XPU_FALLBACK=1 + mkdir -p ut_log/ut_op + cd pytorch/third_party/torch-xpu-ops/test/xpu + python run_test_with_skip.py \ + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + find ut_op_with_skip_nn ut_op_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"ut_op_with_skip_quantization/core"*) + dir_name="ut_op_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + ls -al ut_op_with_skip_nn ut_op_with_skip_quantization/core + cp ut_op_with_skip_nn/*.xml ${{ 
github.workspace }}/ut_log + cp ut_op_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log + # Cases run with an on-demand white list, since some suites are too + # slow to go through all operators on CPU. So add cases on-demand + # when XPU implementation is done. + # test_foreach, test_decomp + # Run with only + python run_test_with_only.py \ + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + - name: ut_torch + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_torch' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + mkdir -p ut_log/ut_torch + cd pytorch + test_cmd="python test/run_test.py --include " + for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done + for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done + if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi + eval $test_cmd 2> ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test.log + - name: ut_profiling + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_profiling' }} + run: | + mkdir -p ut_log/xpu_profiling/issue_reproduce + cd pytorch/third_party/torch-xpu-ops + # RN50 Test + PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 + cp profiling.fp32.train.pt ${{ github.workspace }}/ut_log/xpu_profiling + # All Issue Reproduce UT + python -u test/profiling/correlation_id_mixed.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log + python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log + python -u 
test/profiling/time_precision_in_profile.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log + python -u test/profiling/profile_partial_runtime_ops.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log + python -u test/profiling/triton_xpu_ops_time.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log + + # llama case for calls number test + pip install transformers + python test/profiling/llama.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log + python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + + # All xpu ut under test/profiler + cd ../../test/profiler + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_cpp_thread.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_cpp_thread.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_execution_trace.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_execution_trace.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_memory_profiler.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_memory_profiler.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_profiler_tree.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log + + - name: xpu_dev1 + shell: bash -xe {0} + if: ${{ inputs.test_type == 'xpu_dev1' }} + run: | + mkdir -p ut_log/xpu_dev1 + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 200 -v test_operation_on_device_1.py \ + --junit-xml=${{ github.workspace }}/ut_log/xpu_dev1.xml \ + 2> ${{ github.workspace 
}}/ut_log/xpu_dev1/xpu_dev1_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test.log + + - name: xpu_distributed + shell: bash -x -e -o pipefail {0} + if: ${{ inputs.test_type == 'xpu_distributed' }} + run: | + mkdir -p ut_log/xpu_distributed + cd pytorch/third_party/torch-xpu-ops/test/xpu + XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") + if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then + echo -e "[ERROR] XCCL is not enabled" + exit 1 + fi + timeout 1800 python run_distributed.py \ + 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index ac4067e7ce..5fc3a9993c 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -14,28 +14,19 @@ inputs: type: string default: 'accuracy' description: accuracy,performance. 
Delimiter is comma - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test pytorch: required: false type: string default: 'main' description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling runs: using: composite steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci # accuracy code if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then rm -rf pt2e-accuracy @@ -46,38 +37,6 @@ runs: rm -rf pt2e-performance git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark pt2e-performance fi - # deps - if [[ ${{ inputs.scenario }} == *"performance"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - rm -rf pt2e-audio - git clone --single-branch -b main https://github.com/pytorch/audio pt2e-audio - cd pt2e-audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ - rm -rf pt2e-vision - git clone --single-branch -b main https://github.com/pytorch/vision pt2e-vision - cd pt2e-vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - cd ../ - fi - # torchbench - python -c "import torch, torchvision, torchaudio" - cd pt2e-performance - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install 
pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - pip install numpy==1.26.4 # dataset if [ ! -d ${HOME}/datasets/imagenet ];then rm -rf ${HOME}/datasets/imagenet @@ -89,14 +48,8 @@ runs: bash valprep.sh fi - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci - set -xe pt2e_logs_dir="${{ github.workspace }}/../pytorch/inductor_log/pt2e" rm -rf "${pt2e_logs_dir}" && mkdir -p "${pt2e_logs_dir}" echo "Mode,Model,Dtype,Result" |tee ${pt2e_logs_dir}/summary.csv @@ -107,14 +60,14 @@ runs: do if [[ "${{ inputs.dt }}" == *"float32"* ]];then ${cmd_line} --model_list ${model_name} --is_fp32 2>&1 |tee "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} END{printf("Accuracy,%s,float32,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv fi if [[ "${{ inputs.dt }}" == *"int8"* ]];then ${cmd_line} --model_list ${model_name} 2>&1 |tee "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} 
END{printf("Accuracy,%s,int8,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48c09606de..3c82666af0 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -6,7 +6,7 @@ # Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="lts", help="rolling or lts") +parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") parser.add_argument("--category", type=str, default="inductor", help="inductor") parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") parser.add_argument("--mode", type=str, required=True, help="inference or training") diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 5bce6eacdf..001e5c9b44 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -1,18 +1,18 @@ #!/bin/bash # Usage: # ./build.sh --WORKSPACE= \ -# --PYTORCH_REPO= --PYTORCH_VERSION= \ +# --PYTORCH_REPO= --PYTORCH_COMMIT= \ # --TORCH_XPU_OPS_REPO= \ -# --TORCH_XPU_OPS_VERSION= +# --TORCH_XPU_OPS_COMMIT= set -xe export GIT_PAGER=cat # Init params WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"}) PYTORCH_REPO=${PYTORCH_REPO:-"https://github.com/pytorch/pytorch.git"} -PYTORCH_VERSION=${PYTORCH_VERSION:-"main"} +PYTORCH_COMMIT=${PYTORCH_COMMIT:-"main"} TORCH_XPU_OPS_REPO=${TORCH_XPU_OPS_REPO:-"https://github.com/intel/torch-xpu-ops.git"} -TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"} +TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT:-"main"} for var; do eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")" done @@ -21,20 +21,20 @@ done rm -rf ${WORKSPACE}/pytorch git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch cd 
${WORKSPACE}/pytorch -git checkout ${PYTORCH_VERSION} +git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s git rev-parse HEAD > ${WORKSPACE}/pytorch.commit # Set torch-xpu-ops -if [ "${TORCH_XPU_OPS_VERSION,,}" == "pinned" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" == "pinned" ];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_VERSION="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" + TORCH_XPU_OPS_COMMIT="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" fi -if [ "${TORCH_XPU_OPS_VERSION,,}" != "cicd" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" != "cicd" ];then rm -rf ${WORKSPACE}/torch-xpu-ops git clone ${TORCH_XPU_OPS_REPO} ${WORKSPACE}/torch-xpu-ops cd ${WORKSPACE}/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_VERSION} + git checkout ${TORCH_XPU_OPS_COMMIT} fi cd ${WORKSPACE}/torch-xpu-ops git remote -v && git branch && git show -s @@ -48,30 +48,32 @@ python -m pip install requests python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static mkl-include +python -m pip install mkl-static==2025.1.0 mkl-include==2025.1.0 export USE_STATIC_MKL=1 -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.1.1 | \ - intel-cmplr-lib-ur==2025.1.1 | \ - intel-cmplr-lic-rt==2025.1.1 | \ - intel-sycl-rt==2025.1.1 | \ - oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - onemkl-sycl-blas==2025.1.0 | \ - onemkl-sycl-dft==2025.1.0 | \ - onemkl-sycl-lapack==2025.1.0 | \ - onemkl-sycl-rng==2025.1.0 | \ - onemkl-sycl-sparse==2025.1.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ - intel-opencl-rt==2025.1.1 | \ - mkl==2025.1.0 | \ - intel-openmp==2025.1.1 | \ - tbb==2022.1.0 | \ 
- tcmlib==1.3.0 | \ - umf==0.10.0 | \ - intel-pti==0.12.3 -" +if [ "${XPU_ONEAPI_PATH}" == "" ];then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ + intel-cmplr-lib-rt==2025.1.1 | \ + intel-cmplr-lib-ur==2025.1.1 | \ + intel-cmplr-lic-rt==2025.1.1 | \ + intel-sycl-rt==2025.1.1 | \ + impi-rt==2021.15.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + oneccl-devel==2021.15.2 | \ + oneccl==2021.15.2 | \ + mkl==2025.1.0 | \ + onemkl-sycl-blas==2025.1.0 | \ + onemkl-sycl-dft==2025.1.0 | \ + onemkl-sycl-lapack==2025.1.0 | \ + onemkl-sycl-rng==2025.1.0 | \ + onemkl-sycl-sparse==2025.1.0 | \ + intel-opencl-rt==2025.1.1 | \ + intel-openmp==2025.1.1 | \ + tbb==2022.1.0 | \ + tcmlib==1.3.0 | \ + umf==0.10.0 | \ + intel-pti==0.12.3 + " +fi # Build sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index c9afb73eb8..ff78bffe12 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -186,14 +186,14 @@ def parse_log_file(log_file): return summary def determine_category(ut): - if ut == 'op_regression': - return 'op_regression' - elif ut == 'op_regression_dev1': - return 'op_regression_dev1' - elif ut == 'op_extended': - return 'op_extended' - elif 'op_ut' in ut: - return 'op_ut' + if ut == 'ut_regression': + return 'ut_regression' + elif ut == 'xpu_dev1': + return 'xpu_dev1' + elif ut == 'ut_extended': + return 'ut_extended' + elif 'ut_op' in ut: + return 'ut_op' else: return 'unknown' diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index c858f6f3f5..d4ad299b59 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -98,7 +98,7 @@ Empty means the cases NOT run\n\n" suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python 
"${check_file}" --driver "${LTS_OR_ROLLING:-"lts"}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" + python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" test_result="$(sed 's/, /,/g' "/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ if($0 ~/Total/){ total = $3; diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh old mode 100644 new mode 100755 index 3b17170385..d0f7cfd338 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,6 +1,6 @@ #!/bin/bash -XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"} +XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}" source ${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 52baa15dd0..dd399471d9 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -1,5 +1,5 @@ #!/bin/bash -ut_suite="${1:-op_regression}" # op_regression / op_extended / op_ut / torch_xpu +ut_suite="${1:-ut_regression}" # ut_regression / ut_extended / ut_op / ut_torch # usage # compare_and_filter_logs [output.log] @@ -86,15 +86,12 @@ check_passed_known_issues() { local file_passed_UT="$1" local file_known_issue="$2" local output_file="${3:-${file_passed_UT%.*}_passed_known_issues.log}" - if [[ $# -lt 2 ]]; then echo "[ERROR] Need 2 files to compare" return 1 fi - echo "Checking for known issues that are now passing in $file_passed_UT" grep -Fxf "$file_passed_UT" "$file_known_issue" > "$output_file" - echo -e "\n\033[1;32m[New passed cases Summary]\033[0m" if [[ -s "$output_file" ]]; then cat "$output_file" @@ -104,7 +101,7 @@ check_passed_known_issues() { fi } -if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' 
]]; then +if [[ "${ut_suite}" == 'ut_regression' || "${ut_suite}" == 'xpu_dev1' || "${ut_suite}" == 'ut_extended' || "${ut_suite}" == 'ut_transformers' ]]; then grep -E "FAILED" "${ut_suite}"_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_failed.log grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_failed.log grep -E "Timeout" "${ut_suite}"_test.log | grep "test" >> ./"${ut_suite}"_failed.log @@ -134,7 +131,7 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'op_ut' ]]; then +if [[ "${ut_suite}" == 'ut_op' ]]; then grep -E "FAILED" op_ut_with_skip_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_skip_test_failed.log grep -E "have failures" op_ut_with_skip_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_skip_test_failed.log grep -E "Timeout" op_ut_with_skip_test.log | grep "test" >> ./"${ut_suite}"_with_skip_test_failed.log @@ -178,8 +175,8 @@ if [[ "${ut_suite}" == 'op_ut' ]]; then num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed.log") fi ((num_failed=num_failed_with_skip+num_failed_with_only)) - grep "PASSED" op_ut_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log - grep "PASSED" op_ut_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log + grep "PASSED" ut_op_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log + grep "PASSED" ut_op_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log num_passed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_passed.log") num_passed_with_only=$(wc -l < "./${ut_suite}_with_only_test_passed.log") ((num_passed=num_passed_with_skip+num_passed_with_only)) @@ -190,13 +187,13 @@ if [[ "${ut_suite}" == 'op_ut' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'torch_xpu' ]]; then +if [[ 
"${ut_suite}" == 'ut_torch' ]]; then echo "Pytorch XPU binary UT checking" cd ../../pytorch || exit for xpu_case in build/bin/*{xpu,sycl}*; do if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then case_name=$(basename "$xpu_case") - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit grep -E "FAILED|have failures" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_failed.log wc -l < "./binary_ut_${ut_suite}_${case_name}_failed.log" | tee -a ./binary_ut_"${ut_suite}"_failed_summary.log grep -E "PASSED|Pass" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_passed.log @@ -207,7 +204,7 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit cat "./binary_ut_${ut_suite}_${case_name}_failed.log" num_failed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_failed_summary.log) num_passed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_passed_summary.log) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 8fbed99275..ccbac87b3d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,165 +3,215 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: - pytorch: + runner: required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 
'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - driver: + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + triton: required: false type: string - default: 'lts' - description: Driver lts/rolling + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - triton: - required: false - type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - outputs: - torch_commit_id: - description: The commit id of the torch build - value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }} permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - build: + runner: runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + build: + name: ${{ inputs.pytorch }} + needs: runner + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - commit_issue: 1280 - GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - outputs: - TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }} + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + env: + GH_TOKEN: ${{ github.token }} timeout-minutes: 300 steps: - - name: Setup based env + - name: Install gh-cli run: | + cat /etc/os-release + hostname && id # Cleanup workspace - rm -rf ${{ github.workspace }}/* - # Install gh - dnf install 'dnf-command(config-manager)' + find ./ |grep -v "^\./$" |xargs rm -rf + # install gh + dnf install -y 'dnf-command(config-manager)' dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf autoremove -y git236* && dnf install -y git - dnf install gh --repo gh-cli -y - # Setup python + dnf install -y gh --repo gh-cli + gh --version + - name: Setup python-${{ inputs.python }} + run: | + rm -rf /tmp/xpu-tool/myvenv local_python=$(echo ${{ inputs.python }} |awk -F. 
'{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /opt/xpu-build - which python && python -V && pip list + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv + which python && python -V + which pip && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Triton XPU + - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ - source /opt/rh/gcc-toolset-13/enable - dnf install -y zlib-devel - cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - fi - - name: Build Pytorch XPU - run: | - set -xe -o pipefail - if [ "${{ inputs.driver }}" == "lts" ]; then - export TORCH_XPU_ARCH_LIST='pvc' + export USE_XCCL=1 + # only build pvc for CI + if [ "${{ inputs.test_type }}" == "build-cicd" ];then + export TORCH_XPU_ARCH_LIST='pvc' fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" + PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - 
PYTORCH_VERSION="${{ inputs.pytorch }}" + PYTORCH_COMMIT="${{ inputs.pytorch }}" fi - if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" - elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then - TORCH_XPU_OPS_VERSION="pinned" + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_VERSION="cicd" + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - # oneAPI DLE - source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable - export USE_XCCL=1 + # oneAPI DLE + if [ "${{ inputs.oneapi }}" != "installed" ];then + rm -rf ${HOME}/intel ${HOME}/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir /opt/intel/oneapi + export XPU_ONEAPI_PATH="/opt/intel/oneapi" + fi + source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ - --PYTORCH_VERSION="${PYTORCH_VERSION}" \ + --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ - --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \ - 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_VERSION//\//-}.log + --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ + 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log + if [ $(ls ${{ github.workspace }} |grep -c "torch-.*.whl") -eq 0 ];then + echo "Build pytorch got failed" + exit 1 + fi + - name: Build Triton + run: | + # gcc 13 + 
dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + pip install cmake ninja pybind11 + rm -rf pytorch_triton_xpu-*.whl + if [ "${{ inputs.triton }}" != "pinned" ];then + TRITON_COMMIT_ID="${{ inputs.triton }}" + else + TRITON_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)" + fi + TRITON_VERSION_NAME="$( + curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ + grep '__version__' |head -n 1 |awk -F "'" '{print $2}' + )" + python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ + 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log + if [ $(ls |grep -c "pytorch_triton_xpu-.*.whl") -eq 0 ];then + echo "Build triton got failed" + exit 1 + fi + pip install pytorch_triton_xpu-*.whl + cp pytorch_triton_xpu-*.whl ${{ github.workspace }} + - name: Build Torchvision and Torchaudio + run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" + TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" + git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision + cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchvision-.*.whl") -eq 0 ];then + echo "Build torchvision got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} + git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio + cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchaudio-.*.whl") -eq 0 ];then 
+ echo "Build torchaudio got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} - name: Torch Config run: | + printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py - - name: Identify Build version - id: build_version - run: | - echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" + pip list |grep -E 'torch|intel' + chmod 777 /__w -R - name: Upload Torch XPU Wheel - if: ${{ ! cancelled() }} + if: ${{ success() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/torch*.whl - - name: Upload Triton Wheel - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Triton-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_*.log - - name: Cleanup - if: always() - run: | - chmod 777 . 
-R - rm -rf pytorch torch-xpu-ops pytorch_*.log torch*.whl pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/build_*.log diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml new file mode 100644 index 0000000000..9abe81cacb --- /dev/null +++ b/.github/workflows/_linux_e2e.yml @@ -0,0 +1,254 @@ +name: Linux E2E Test + +on: + workflow_call: + inputs: + runner: + required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + dt: + type: string + default: 'float32' + description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma + mode: + type: string + default: 'inference' + description: Test mode. `inference,training`. Delimiter is comma + scenario: + type: string + default: 'accuracy' + description: Test scenario. `accuracy,performance`. Delimiter is comma + model: + required: false + type: string + default: '' + description: Model. 
Will only run this one mode if set + +permissions: read-all + +defaults: + run: + shell: bash -xe {0} + +jobs: + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + test: + runs-on: ${{ needs.runner.outputs.runner_id }} + needs: runner + timeout-minutes: 3600 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + env: + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface + MODEL_ONLY_NAME: ${{ inputs.model }} + env: + GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + uses: ./.github/actions/linux-testenv + with: + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} + suite: ${{ inputs.suite }} + + # CICD launch + - name: CICD Huggingface BF16 & FP16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface 
+ dt: bfloat16,float16 + mode: training + scenario: accuracy,performance + - name: CICD Timm_models BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: CICD Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + + # Nihglty launch + - name: Nightly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: float16 + mode: training + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: Nightly PT2E Full Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: float32,int8 + scenario: accuracy,performance + + # Weekly launch + - name: Weekly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'huggingface') }} + uses: 
./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly Timm_models Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly Torchbench Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly PT2E Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: float32,int8 + scenario: accuracy,performance + + # On-demand launch + - name: OnDemand Test (huggingface) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (timm_models) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (torchbench) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ 
inputs.scenario }} + - name: OnDemand PT2E Test (pt2e) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: ${{ inputs.dt }} + scenario: ${{ inputs.scenario }} + + - name: Get archieve files + if: ${{ ! cancelled() }} + run: | + rm -rf ${{ github.workspace }}/upload_files + cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files + - name: Upload Inductor XPU E2E Data + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.suite }} + path: ${{ github.workspace }}/upload_files diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml new file mode 100644 index 0000000000..746bc1d565 --- /dev/null +++ b/.github/workflows/_linux_e2e_summary.yml @@ -0,0 +1,98 @@ +name: Linux E2E Test + +on: + workflow_call: + inputs: + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + python: + type: string + default: '3.10' + description: Python version + +permissions: read-all + +defaults: + run: + shell: bash -xe {0} + +jobs: + summary: + runs-on: ubuntu-latest + if: ${{ ! 
cancelled() }} + permissions: + issues: write + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + steps: + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download Target Artifact + run: | + mkdir target/ + cd target/ + target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*" + gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -p "${target_dir}" + find Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*/ -maxdepth 1 -mindepth 1 -type d |\ + while read line; do mv $line .; done + - name: Download Baseline Artifact + run: | + mkdir baseline/ + cd baseline/ + artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/;s/cicd/weekly/')" + gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt + REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" + if [ "${REFERENCE_RUN_ID}" != "" ];then + gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" + find Inductor-*-XPU-E2E-*/ -maxdepth 1 -mindepth 1 -type d |while read line; do mv $line .; done + fi + - name: Get summary + if: ${{ ! 
cancelled() }} + run: | + pip install pandas requests + e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" + if [ -f "${e2e_summary_csv}" ];then + bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} + exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) + if [ ${exit_label} -ne 0 ];then + grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 + echo "There are ${exit_label} cases that need look into!!! Please check them" + exit ${exit_label} + fi + fi + pt2e_summary_csv="$(find ./target/ -name "summary.csv")" + if [ -f "${pt2e_summary_csv}" ];then + cat ${pt2e_summary_csv} + failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + if [ ${failed_num} -ne 0 ];then + echo "[Warning] PT2E has failures!" + fi + fi + - name: Upload Reference Run ID + if: ${{ ! (contains(inputs.test_type, 'ondemand') || contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} + run: | + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + if [ ${has_or_not} -ne 0 ];then + sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt + else + echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt + fi + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt + - name: Set permissions + if: ${{ always() }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 9760e6d960..2ab84d571a 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -3,112 +3,88 @@ name: Linux OP Benchmark Test on: workflow_call: inputs: + runner: + 
required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: type: string - default: '' - description: Triton commit. Use pytorch pined commit by default + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false - type: string - default: 'rolling' - description: Driver lts/rolling -permissions: - issues: write +permissions: read-all + +defaults: + run: + shell: bash -xe {0} jobs: - op_benchmark_test: - runs-on: ${{ inputs.runner }} + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + op_benchmark: + needs: runner + runs-on: ${{ needs.runner.outputs.runner_id }} + permissions: + issues: write timeout-minutes: 900 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem 
--device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + env: + AGENT_TOOLSDIRECTORY: /opt/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface + REFERENCE_ISSUE: 1689 env: GH_TOKEN: ${{ github.token }} - reference_issue: 1689 - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/pytorch/pytorch pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - fi - pip install -r .ci/docker/requirements-ci.txt - - name: Torch Config - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - cd .. 
- python pytorch/torch/utils/collect_env.py - rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache - name: Run Torch XPU Op Benchmark - if: ${{ inputs.driver == 'rolling' }} run: | - source activate xpu_op_${ZE_AFFINITY_MASK} mkdir -p ${{ github.workspace }}/op_benchmark cd test/microbench filename=$(find -- *.py) @@ -129,12 +105,17 @@ jobs: path: ${{ github.workspace }}/op_benchmark op_benchmark_test_results_check: - needs: op_benchmark_test - runs-on: ubuntu-22.04 + needs: op_benchmark + runs-on: ubuntu-latest env: GH_TOKEN: ${{ github.token }} reference_issue: 1689 steps: + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python-${{ inputs.python }} @@ -174,7 +155,6 @@ jobs: python ${{ github.workspace }}/.github/scripts/op_perf_comparison.py --xpu_file ${{ github.workspace }}/op_benchmark/backward_op_summary.csv --baseline_file ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv - name: Update OP Baseline run: | - pip install tabulate pandas mkdir ${{ github.workspace }}/new_baseline cp ${{ github.workspace }}/baseline/baseline*.csv ${{ github.workspace }}/new_baseline # Update forward op @@ -190,6 +170,6 @@ jobs: path: ${{ github.workspace }}/op_benchmark - name: Upload Reference Run ID run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} --json body -q .body | \ sed "s/Inductor-XPU-OP-Benchmark-Data:.*/Inductor-XPU-OP-Benchmark-Data: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE} --body-file new_body.txt diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 07c83ea143..33d0b54d8d 100644 --- 
a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -3,493 +3,198 @@ name: Linux UT Test on: workflow_call: inputs: - pytorch: - required: false + runner: + required: true type: string - default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Runner label + test_type: + required: true type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + description: Test scope + pytorch: type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - ut: - required: true + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: '' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu` Delimiter is comma - disabled_tests: - required: false + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: type: string - default: '' - description: List disabled tests, such as disable_ut or disable_distributed + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: + ut: required: true type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false + description: UT scope. 
`ut_regression,ut_transformers,ut_extended,ut_op,ut_torch,xpu_dev1` Delimiter is comma + disabled_tests: type: string - default: 'lts' - description: Driver lts/rolling + default: '' + description: List disabled tests, such as disable_ut or disable_distributed permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - ut_test: - runs-on: ${{ matrix.test.runner || inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + normal: + needs: runner + runs-on: ${{ needs.runner.outputs.runner_id }} + if: ${{ contains(inputs.ut, 'ut_') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + -e ZE_AFFINITY_MASK + env: + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} strategy: fail-fast: false matrix: - test: - - name: 'op_regression' - condition: ${{ 
contains(inputs.ut, 'op_regression') }} - directory: 'test/regressions' - command: 'pytest --timeout 600 --timeout_method=thread -v --junit-xml=../../ut_log/op_regression.xml' - log_prefix: 'op_regression' - additional_steps: | - clinfo --list - pip install pytest pytest-timeout - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - directory: 'test/regressions' - command: 'pytest --timeout 600 --timeout_method=thread -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' - log_prefix: 'op_regression_dev1' - additional_steps: | - clinfo --list - unset ZE_AFFINITY_MASK - pip install pytest pytest-timeout - runner: 'pvc_e2e' - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - directory: '../pytorch' - command: 'pytest --timeout 600 --timeout_method=thread -v test/test_transformers.py -k xpu --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml' - log_prefix: 'op_transformers' - additional_steps: | - pip install pytest pytest-timeout - export PYTORCH_TEST_WITH_SLOW=1 - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - directory: '../pytorch/third_party/torch-xpu-ops/test/xpu/extended/' - command: 'python run_test_with_skip.py' - log_prefix: 'op_extended' - additional_steps: | - pip install pytest pytest-timeout - export PYTORCH_TEST_WITH_SLOW=1 - xml_post_processing: | - cp op_extended.xml $GITHUB_WORKSPACE/ut_log - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - directory: '../pytorch/third_party/torch-xpu-ops/test/xpu' - log_prefix: 'op_ut' - command_script: | - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log - cp *.xml $GITHUB_WORKSPACE/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec 
sh -c ' - dir_path=$(dirname "$1"); - case "$dir_path" in - *"op_ut_with_skip_quantization/core"*) - dir_name="op_ut_with_skip_quantization_core";; - *) - dir_name=$(basename "$dir_path");; - esac; - mv "$1" "$dir_path/${dir_name}_$(basename "$1")" - ' _ {} \; - cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. - # test_foreach, test_decomp - # Run with only - timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log - cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log - additional_steps: | - pip install pytest pytest-timeout - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - directory: '../pytorch' - command_script: | - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" - test_cmd="python test/run_test.py --include " - for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done - for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done - if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log - log_prefix: 'torch_xpu' - additional_steps: | - pip install pytest pytest-timeout - - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} - directory: '$GITHUB_WORKSPACE' - command_script: | - mkdir -p $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce - # RN50 Test - PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt 
$GITHUB_WORKSPACE/ut_log/xpu_profiling - - # All Issue Reproduce UT - python -u test/profiling/correlation_id_mixed.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log - python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log - python -u test/profiling/time_precision_in_profile.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log - python -u test/profiling/profile_partial_runtime_ops.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log - python -u test/profiling/triton_xpu_ops_time.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log - - # llama case for calls number test - python test/profiling/llama.py | \ - tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log - python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - - # All xpu ut under test/profiler - cd ../pytorch/test/profiler - python -m pytest --timeout 600 -vs test_cpp_thread.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_cpp_thread.log - python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_execution_trace.log - python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_memory_profiler.log - python -m pytest --timeout 600 -vs test_profiler_tree.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_profiler_tree.log - additional_steps: | - pip install pytest pytest-timeout transformers - outputs: - ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} + 
test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling] steps: + - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - - name: Create unique workspace - shell: bash -xe {0} - run: | - # Create unique conda env for each UT test - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=xpu_op_${ZE_AFFINITY_MASK}_${{ matrix.test.name }}_${random}" >> $GITHUB_ENV - - name: Create Conda Env - shell: bash -xe {0} - run: | - pwd - which conda - conda remove --all -y -n $CONDA_ENV_NAME || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -n $CONDA_ENV_NAME python=${{ inputs.python }} cmake ninja -y - source activate $CONDA_ENV_NAME - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + if: ${{ contains(inputs.ut, matrix.test) }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - shell: bash -xe {0} - run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate $CONDA_ENV_NAME - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
- else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff - pip install -r .ci/docker/requirements-ci.txt - - name: Prepare Torch-xpu-ops - shell: bash -xe {0} - run: | - cd ../pytorch - rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party - else - TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - name: Run XPU UT Test - shell: bash -xe {0} - if: ${{ matrix.test.condition }} - run: | - set -e - mkdir -p ${{ github.workspace }}/ut_log - mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - source activate $CONDA_ENV_NAME - echo "Running ${{ matrix.test.name }}" - echo "Directory: ${{ matrix.test.directory }}" - ${{ matrix.test.additional_steps }} - - cd ${{ matrix.test.directory }} - - if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then - bash << "SCRIPT" - set -e - ${{ matrix.test.command_script }} - SCRIPT - else - ${{ matrix.test.command }} \ - 2>${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test_error.log | \ - tee ${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test.log - ${{ matrix.test.xml_post_processing || '' }} - fi + if: ${{ contains(inputs.ut, matrix.test) }} + uses: ./.github/actions/linux-uttest + with: + test_type: ${{ matrix.test }} - name: UT Test Results Summary - shell: bash -xe {0} - if: ${{ 
matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} run: | - source activate $CONDA_ENV_NAME pip install junitparser - python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e "ut_failure_list.csv" ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv - fi - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME + python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + if [ -e ut_failure_list.csv ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true fi - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log + if-no-files-found: ignore - name: Upload XPU UT Failure list - if: ${{ matrix.test.condition }} + if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv - - name: Set UT outputs - id: set-output - if: ${{ matrix.test.condition }} - run: | - echo "UT_NAME=${{ matrix.test.name }}" >> $GITHUB_OUTPUT - - ut_test_results_check: - needs: ut_test - runs-on: ubuntu-22.04 + if-no-files-found: ignore + + devices: + runs-on: pvc_rolling + if: ${{ contains(inputs.ut, 'xpu_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 30 env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 - strategy: - fail-fast: false - matrix: - test: - - name: 'op_regression' - condition: ${{ contains(inputs.ut, 'op_regression') }} - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: - - name: Get matrix UT value + - name: Cleanup workspace + id: cleanup run: | - echo "UT_NAME=${{ needs.ut_test.outputs.ut_name }}" >> "${GITHUB_ENV}" + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ~/.torch + xpu-smi discovery - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Download XPU UT Logs - if: ${{ matrix.test.condition }} - 
uses: actions/download-artifact@v4 + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} - path: ${{ github.workspace }}/ut_log - - name: Check UT Results - if: ${{ matrix.test.condition }} - shell: bash - run: | - repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log - cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh ${{ matrix.test.name }} + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} + - name: Run XPU UT Test + uses: ./.github/actions/linux-uttest + with: + test_type: xpu_dev1 - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_dev1 path: ${{ github.workspace }}/ut_log - distributed_ut_test: + distributed: runs-on: pytorch-06 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: + - name: Cleanup workspace + id: cleanup + run: | + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ~/.torch + xpu-smi topology -m - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Create Conda Env - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate xpu_op_${ZE_AFFINITY_MASK} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip 
install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff - pip install -r .ci/docker/requirements-ci.txt - - name: Prepare Torch-xpu-ops - run: | - cd ../pytorch - rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party - else - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + uses: ./.github/actions/linux-uttest + with: + test_type: xpu_distributed - name: Reset Ptrace_scope if: ${{ always() }} run: | @@ -526,53 +220,52 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log - distributed_ut_test_results_check: - needs: distributed_ut_test - runs-on: ubuntu-22.04 + summary: + needs: [normal, devices, distributed] + if: ${{ ! 
cancelled() }} + runs-on: ubuntu-latest timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling, xpu_dev1, xpu_distributed] env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 + UT_SKIP_ISSUE: 1624 steps: - - name: Set the UT name + - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} run: | - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - name: Download XPU UT Logs + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/download-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log - name: Check UT Results + if: ${{ contains(inputs.ut, matrix.test) }} shell: bash run: | repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" 
- contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/xpu_distributed - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log + ls -al ${{ github.workspace }}/ut_log + cd ${{ github.workspace }}/ut_log/${{ matrix.test }} + gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | \ + grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' 
| sed 's/ *|| */ /g' | sort -u > issues_temp.log + awk '$2 == "ut_op" {print $1}' issues_temp.log > issues_ut_op.log cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log + awk -F'::' '{print $1}' issues_ut_op.log | sort -u | paste -sd ',' >> Known_issue.log cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'xpu_distributed' + bash ut_result_check.sh ${{ matrix.test }} - name: Upload Inductor XPU UT Log - if: always() + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }}-checked path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index ee628792f0..3c211ccfc2 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -8,7 +8,7 @@ on: type: string default: 'main' description: Pytorch branch/commit - keep_torch_xpu_ops: + torch_xpu_ops: required: false type: string default: 'false' @@ -17,7 +17,7 @@ on: required: true type: string default: '' - description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. `ut_regression,xpu_dev1,ut_extended,ut_op,ut_torch` Delimiter is comma python: required: false type: string @@ -89,7 +89,7 @@ jobs: git status git show -s git submodule sync && git submodule update --init --recursive - if ${{ inputs.keep_torch_xpu_ops }} == 'true' ( + if ${{ inputs.torch_xpu_ops }} == 'pinned' ( echo "Don't replace torch-xpu-ops!" ) else ( echo "Replace torch-xpu-ops!" 
@@ -157,7 +157,7 @@ jobs: path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\dist' - name: Run XPU OP Extended UT - if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_extended') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" @@ -169,7 +169,7 @@ jobs: python run_test_with_skip_mtl.py - name: Run Test XPU UT - if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_torch') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 25c3af0245..93826b34a8 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -3,442 +3,184 @@ name: Nightly-OnDemand Tests on: schedule: # GMT+8 21:00 every workday - - cron: '0 13 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 16 * * 5' + - cron: '10 13 * * 0-4' # build from source + - cron: '30 13 * * 0-4' # nightly wheel + # GMT+8 00:00 Saturday + - cron: '10 16 * * 5' # build from source + - cron: '30 16 * * 5' # nightly wheel workflow_dispatch: inputs: pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. 
Delimiter is comma + default: 'triggered' + description: Torch-xpu-ops workflow triggered branch by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: - required: false + type: string + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + ut: type: string default: '' - description: Triton commit. Use pytorch pined commit by default + description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + default: '[]' + description: Dynamo benchmarks test suite. `["huggingface","timm_models","torchbench","pt2e"]`. Delimiter is comma dt: - required: true type: string - default: 'float32' + default: '' description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: - required: true type: string - default: 'inference' + default: '' description: Test mode. `inference,training`. Delimiter is comma scenario: - required: true type: string - default: 'accuracy' + default: '' description: Test scenario. `accuracy,performance`. Delimiter is comma model: - required: false type: string default: '' description: Model. 
Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version permissions: read-all -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} +run-name: ${{ (contains(github.event.schedule, '13') && 'Nightly') || (contains(github.event.schedule, '16') && 'Weekly') || 'On-demand' }} / ${{ (contains(github.event.schedule, '10') && 'Source Code') || (contains(github.event.schedule, '30') && 'CD Wheel') || inputs.pytorch }} jobs: - Linux-Nightly-Ondemand-Build: + Conditions-Filter: + name: conditions-filter if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand + runs-on: ubuntu-latest + timeout-minutes: 3 + outputs: + test_type: ${{ steps.inputs-check.outputs.test_type }} + pytorch: ${{ steps.inputs-check.outputs.pytorch }} + torch_xpu_ops: ${{ steps.inputs-check.outputs.torch_xpu_ops }} + steps: + - name: Inputs check + id: inputs-check + run: | + if [ "${{ github.event_name }}" == "schedule" ];then + if [ "${{ github.event.schedule }}" == "10 13 * * 0-4" ];then + test_type="build-nightly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 13 * * 0-4" ];then + test_type="wheel-nightly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + elif [ "${{ github.event.schedule }}" == "10 16 * * 5" ];then + test_type="build-weekly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then + test_type="wheel-weekly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + else + test_type="unknown" + pytorch="main" + torch_xpu_ops="main" + fi + else + pytorch="${{ inputs.pytorch }}" + torch_xpu_ops="${{ 
inputs.torch_xpu_ops }}" + if [[ "${{ inputs.pytorch }}" == *"_wheel" ]];then + test_type="wheel-ondemand" + else + test_type="build-ondemand" + fi + fi + echo "test_type=${test_type}" >> ${GITHUB_OUTPUT} + echo "pytorch=${pytorch}" >> ${GITHUB_OUTPUT} + echo "torch_xpu_ops=${torch_xpu_ops}" >> ${GITHUB_OUTPUT} + + Linux-Nightly-Ondemand-Build: + needs: [Conditions-Filter] + name: linux-build secrets: inherit uses: ./.github/workflows/_linux_build.yml with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - runner: pvc_e2e + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Linux-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand - needs: Linux-Nightly-Ondemand-Build + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'ut_') || contains(inputs.ut, 'xpu_') }} + name: linux-ut + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' 
&& '' || inputs.triton }} runner: linux.idc.xpu + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-ut + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + ut: ${{ github.event_name == 'schedule' && 'ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op' || inputs.ut }} Linux-Nightly-Ondemand-E2E-Tests: - runs-on: pvc_e2e - name: linux-nightly-ondemand / e2e_test - needs: Linux-Nightly-Ondemand-Build - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ 
steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo 
"TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: 
./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - 
scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + if: ${{ github.event_name == 'schedule' || inputs.suite != '[]' }} + name: linux-e2e + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + strategy: + fail-fast: false + matrix: + suite: ${{ fromJSON(inputs.suite) }} + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + suite: ${{ matrix.suite }} + dt: ${{ github.event_name == 'schedule' && 'float32' || inputs.dt }} + mode: ${{ github.event_name == 'schedule' && 'inference' || inputs.mode }} + scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} + model: ${{ github.event_name == 'schedule' && '' || 
inputs.model }} + Linux-Nightly-Ondemand-E2E-Tests-Summary: + if: ${{ ! cancelled() }} + name: linux-e2e + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-LTS-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-LTS-XPU-E2E:.*/Inductor-${{ env.run_type }}-LTS-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} + name: linux-microbench + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + uses: ./.github/workflows/_linux_op_benchmark.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-mb + pytorch: ${{ 
needs.Conditions-Filter.outputs.pytorch }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Windows-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: Windows-nightly-ondemand + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} + name: windows + needs: [Conditions-Filter] uses: ./.github/workflows/_windows_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + ut: ${{ github.event_name == 'schedule' && 'ut_extended,ut_torch' || inputs.ut }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false has_label: true runner: Windows_CI - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "failure" ];then - 
test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC 
| Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION | $KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml deleted file mode 100644 index 03101ebf3a..0000000000 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ /dev/null @@ -1,460 +0,0 @@ -name: Nightly-OnDemand Tests Rolling - -on: - schedule: - # GMT+8 21:30 every workday - - cron: '30 13 * * 0-4' - # GMT+8 0:30 Saturday - - cron: '30 16 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - triton: - required: false - type: string - default: '' - description: Triton commit. 
Use pytorch pined commit by default - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-Build-Rolling: - if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand-rolling - secrets: inherit - uses: ./.github/workflows/_linux_build.yml - with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-UT-Tests-Rolling: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand-rolling - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_ut.yml - with: - 
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: - name: linux-nightly-ondemand-rolling / Op_microbench - permissions: - issues: write - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_op_benchmark.yml - with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-E2E-Tests-Rolling: - runs-on: pvc_rolling - name: linux-nightly-ondemand-rolling / e2e_test - needs: Linux-Nightly-Ondemand-Build-Rolling - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - NEOReadDebugKeys: 1 - DisableScratchPages: 1 - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '30 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - 
DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo 
"TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16 - mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - 
driver: rolling - - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Rolling-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - export LTS_OR_ROLLING='rolling' - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Rolling-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Rolling-XPU-E2E:.*/Inductor-${{ env.run_type }}-Rolling-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests-Rolling - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_COMMIT_ID }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.KERNEL_VERSION }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.DRIVER_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == 
"success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Rolling Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e 
"[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | rolling-$DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml deleted file mode 100644 index 23f5456f28..0000000000 --- a/.github/workflows/nightly_ondemand_whl.yml +++ /dev/null @@ -1,396 +0,0 @@ -name: Torch Nightly WHL Tests - -on: - schedule: - # GMT+8 21:00 every workday - - cron: '0 14 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 17 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'nightly' - description: Pytorch branch/commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. 
Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-UT-WHL-Tests: - if: ${{ (github.event_name == 'schedule' || inputs.ut != '') && github.repository_owner == 'intel' }} - uses: ./.github/workflows/_linux_ut.yml - with: - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - pytorch: nightly_wheel - runner: linux.idc.xpu - - Linux-Nightly-Ondemand-E2E-WHL-Tests: - runs-on: pvc_e2e - if: ${{ github.repository_owner == 'intel' }} - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 17 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - 
TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} - TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Prepare Stock Pytorch - id: installed - run: | - pwd - cd ../ - source activate e2e_ci - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs 
for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source activate e2e_ci - source .github/scripts/env.sh - echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ../pytorch - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly PT2E Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - env_prepare: true - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: 
float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - 
- name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Pre-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! 
Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Pre-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Pre-XPU-E2E:.*/Inductor-${{ env.run_type }}-Pre-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-WHL-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" - TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ 
needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 17 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - 
echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3f3b1c1b58..23683fa701 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -21,9 +21,8 @@ concurrency: jobs: preci-lint-check: - name: preci-lint-check if: ${{ github.repository_owner == 'intel' }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 30 steps: - name: Checkout torch-xpu-ops @@ -51,11 +50,9 @@ jobs: export CLANG=1 bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh - preci-conditions-filter: - name: preci-conditions-filter - if: ${{ github.event.pull_request.draft == false }} - needs: [preci-lint-check] - runs-on: ubuntu-22.04 + conditions-filter: + if: ${{ github.repository_owner == 'intel' && github.event.pull_request.draft == false }} + runs-on: ubuntu-latest timeout-minutes: 10 env: GH_TOKEN: ${{ github.token }} @@ -92,175 +89,58 @@ jobs: 
disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)" echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" - preci-linux-build: - name: preci-linux - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}} - needs: [preci-conditions-filter] + linux-build: + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} + needs: [conditions-filter, preci-lint-check] secrets: inherit uses: ./.github/workflows/_linux_build.yml with: + runner: pvc_rolling + test_type: build-cicd pytorch: main - runner: pvc_e2e + torch_xpu_ops: cicd - preci-linux-ut: - name: preci-linux - needs: [preci-conditions-filter, preci-linux-build] + linux-ut: + needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_ut.yml with: - disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} - ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + test_type: build-cicd-ut + pytorch: main + torch_xpu_ops: cicd + ut: ut_regression,ut_transformers,ut_extended,ut_op,xpu_dev1,xpu_distributed + disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} - preci-linux-e2e: - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - name: preci-linux / e2e_test - needs: [preci-conditions-filter, preci-linux-build] - runs-on: pvc_e2e - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - timeout-minutes: 300 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=3.10 cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number }} - - 
name: Install Pytorch XPU - run: | - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs for stock pytorch - # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - git show -s && git status && git diff - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - python .github/scripts/build_triton_wheel.py --device xpu - pip install pytorch_triton_xpu-*.whl - - name: Identify pinned versions - run: | - cd ../pytorch - echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - source ../torch-xpu-ops/.github/scripts/env.sh - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - - name: Torch Config - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - cd .. - source activate e2e_ci - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - - name: Huggingface BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Huggingface FP16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Timm_models BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Torchbench BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files || sudo rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - # Print summary - source activate e2e_ci - export IS_PR=1 - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! 
cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files + linux-e2e: + name: linux-e2e + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} + needs: [conditions-filter, linux-build] + strategy: + fail-fast: false + matrix: + suite: [huggingface, timm_models, torchbench] + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling + test_type: build-cicd-e2e + pytorch: main + suite: ${{ matrix.suite }} + linux-e2e-summary: + if: ${{ ! cancelled() }} + name: linux-e2e + permissions: write-all + needs: [linux-e2e] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + test_type: build-cicd-e2e - preci-windows: - name: preci-windows - if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }} - needs: [preci-conditions-filter] + windows: + name: windows + if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} + needs: [conditions-filter, preci-lint-check] uses: ./.github/workflows/_windows_ut.yml with: - ut: op_extended,torch_xpu + ut: ut_extended,ut_torch runner: Windows_CI - src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }} - has_label: ${{ needs.preci-conditions-filter.outputs.has_label }} + src_changed: ${{ needs.conditions-filter.outputs.src_changed }} + has_label: ${{ needs.conditions-filter.outputs.has_label }} diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 01a608ae6d..49f3be5876 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,6 +1,7 @@ import os import sys +import torch from skip_list_common import skip_dict from skip_list_win import 
skip_dict as skip_dict_win @@ -16,8 +17,20 @@ skip_options += skip_option skip_options += '"' +# pytest options +xpu_num = torch.xpu.device_count() +parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " +) +test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " + os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = "pytest --timeout 600 -v --timeout_method=thread --junit-xml=./op_extended.xml test_ops_xpu.py" +test_command = ( + f" pytest {test_options} -v --junit-xml=./ut_extended.xml test_ops_xpu.py " +) test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 9d70896b11..06ebc87e8d 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -1,12 +1,25 @@ import os import sys +import torch + # Cases in the file is too slow to run all suites on CPU. So add white list. 
def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -14,8 +27,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - "pytest --timeout 600 -v " - + "--junit-xml=./op_ut_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + skip_options ) @@ -27,15 +39,14 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - "pytest --timeout 600 -v " - + "--junit-xml=./op_ut_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + exe_options ) return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -v --junit-xml=./op_ut_with_only.xml " + test_case + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index df524100b3..26c0152f71 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1163,6 +1163,17 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options 
= f" --timeout 600 --timeout_method=thread {parallel_options} " + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1170,7 +1181,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += skip_options @@ -1181,13 +1192,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) return os.system(test_command)