Commit 51745a9

add ci (#1642)

1 parent 2cffdf7 commit 51745a9
File tree

14 files changed, +845 -48 lines changed
.github/workflows/kt-kernel-tests.yml

Lines changed: 104 additions & 0 deletions

@@ -0,0 +1,104 @@
name: PR KT-Kernel Test

on:
  pull_request:
    branches:
      - main
      - develop
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: pr-kt-kernel-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # =============================================== check changes ====================================================
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      kt_kernel: ${{ steps.filter.outputs.kt_kernel }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Fail if the PR does not have the 'run-ci' label
        if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
        run: |
          echo "This pull request does not have the 'run-ci' label. Failing the workflow."
          exit 1

      - name: Fail if the PR is a draft
        if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
        run: |
          echo "This pull request is a draft. Failing the workflow."
          exit 1

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        with:
          filters: |
            kt_kernel:
              - "kt-kernel/**"
              - ".github/workflows/kt-kernel-tests.yml"

  # =============================================== KT-Kernel tests ====================================================
  per-commit-kt-kernel-cpu:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      (needs.check-changes.outputs.kt_kernel == 'true' || github.event_name == 'workflow_dispatch')
    runs-on: kt-cpu
    continue-on-error: false
    steps:
      - name: Cleanup
        run: |
          sudo rm -rf $GITHUB_WORKSPACE/* || true

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install KT-Kernel
        run: |
          cd kt-kernel
          bash install.sh build

      - name: Run KT-Kernel CPU tests
        timeout-minutes: 30
        run: |
          cd kt-kernel/test
          python3 run_suite.py --hw cpu --suite default

  # =============================================== finish ====================================================
  pr-test-kt-kernel-finish:
    needs: [check-changes, per-commit-kt-kernel-cpu]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        run: |
          # Convert the 'needs' context to a JSON string
          json_needs='${{ toJson(needs) }}'

          # Get a list of all job names from the JSON keys
          job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')

          for job in $job_names; do
            # For each job, extract its result
            result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')

            # Print the job name and its result
            echo "$job: $result"

            # Check for failure or cancellation and exit if found
            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              echo "The above jobs failed."
              exit 1
            fi
          done

          # If the loop completes, all jobs were successful
          echo "All jobs completed successfully"
          exit 0
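The finish job's gating logic can be exercised locally by substituting a hand-written payload for `${{ toJson(needs) }}`. A minimal sketch, assuming `jq` is installed (the payload below is hypothetical, standing in for what GitHub Actions would inject):

```shell
# Hypothetical 'needs' payload, mimicking the shape of ${{ toJson(needs) }}
json_needs='{"check-changes":{"result":"success"},"per-commit-kt-kernel-cpu":{"result":"success"}}'

# Same loop as the workflow step: print each job's result and fail fast
# on any failure or cancellation.
for job in $(echo "$json_needs" | jq -r 'keys_unsorted[]'); do
  result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')
  echo "$job: $result"
  if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
    echo "The above jobs failed."
    exit 1
  fi
done
echo "All jobs completed successfully"
```

Note that `skipped` deliberately does not trip the check, so the finish job still passes when the CPU job was skipped for an unrelated PR.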

kt-kernel/pyproject.toml

Lines changed: 5 additions & 1 deletion

@@ -30,7 +30,11 @@ dependencies = [
     "black>=25.9.0",
 ]
 
-# No optional dev group needed for formatting; using custom git hooks instead of pre-commit
+[project.optional-dependencies]
+test = [
+    "pytest>=7.0.0",
+    "psutil>=5.9.0",
+]
 
 [project.urls]
 Homepage = "https://github.com/kvcache-ai"

kt-kernel/pytest.ini

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
[pytest]
# Test paths
testpaths = test/per_commit

# File and function naming conventions
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Markers for hardware backends
markers =
    cpu: CPU backend tests (Intel AMX/AVX512/AVX2)
    cuda: CUDA backend tests (NVIDIA GPUs)
    amd: AMD backend tests (ROCm)
    slow: Slow-running tests (>60 seconds)
    requires_model: Tests requiring model files

# Output options
addopts =
    -v
    --tb=short
    --strict-markers

# Filter warnings
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning
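The markers above let a runner select tests per hardware backend with `-m`. A minimal sketch of that selection, using a hypothetical throwaway test file (assumes pytest is installed; outside the repo, the `--strict-markers` option from the ini does not apply, so unregistered markers only warn):

```shell
# Write a tiny demo test file with one 'cpu' and one 'slow' test.
cat > test_marker_demo.py <<'EOF'
import pytest

@pytest.mark.cpu
def test_cpu_only():
    assert 1 + 1 == 2

@pytest.mark.slow
def test_slow_only():
    assert True
EOF

# -m filters by marker expression: the slow test is deselected here.
summary=$(pytest -q -m cpu -p no:cacheprovider test_marker_demo.py | tail -n 1)
echo "$summary"
```

Expressions compose, so a runner can do `-m "cpu and not slow"` to keep per-commit CI fast while reserving `slow` and `requires_model` tests for nightly jobs.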

kt-kernel/scripts/README.md

Lines changed: 33 additions & 0 deletions

@@ -22,6 +22,8 @@ Convert weights to INT4/INT8 format optimized for AMX inference on CPU. These qu
 - **FP16**: 16-bit floating point
 - **BF16**: BFloat16 format
 
+> **⚠️ Precision Warning:** Quantizing directly from FP8 to INT4/INT8 may cause significant accuracy degradation. For best results, use the original **BF16** model as the source for INT4/INT8 quantization.
+
 ## Basic Usage
 
 ### Quantize BF16 model to INT4

@@ -213,6 +215,37 @@
 - `--dataset`: HuggingFace dataset for calibration
 - `--dataset_split`: Dataset split to use
 
+#### Memory Management (Avoiding OOM)
+
+GPTQ quantization requires additional GPU memory for Hessian matrix computation beyond model weights. Use `--max_gpu_memory` to limit GPU memory usage and offload remaining layers to CPU:
+
+```bash
+python scripts/convert_gpu_weights.py \
+    --model_id /path/to/model \
+    --output_dir /path/to/output \
+    --quant_type W4A16 \
+    --max_gpu_memory "40GiB"
+```
+
+**Recommended settings:**
+
+| GPU VRAM | Suggested `--max_gpu_memory` |
+|----------|------------------------------|
+| 24 GiB   | 14-16 GiB                    |
+| 48 GiB   | 30-35 GiB                    |
+| 80 GiB   | 50-60 GiB                    |
+
+Reserve 40-50% of GPU memory for GPTQ's Hessian matrix computation.
+
+**Options:**
+- `--max_gpu_memory`: Maximum GPU memory for model weights per device (e.g., '40GiB')
+- `--max_cpu_memory`: Maximum CPU memory (default: 1000GiB when `--max_gpu_memory` is set)
+
+**Important:** llmcompressor does not support disk offloading. Ensure your machine has enough GPU + CPU memory to load the entire model. If you still encounter OOM:
+1. Reduce `--num_calibration_samples` (e.g., 256)
+2. Reduce `--max_sequence_length` (e.g., 1024)
+3. Use `--force_cpu` to run entirely on CPU (slower but avoids GPU OOM)
+
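The table's suggestions amount to budgeting roughly 60-70% of total VRAM for model weights. An illustrative back-of-envelope (the 65% factor is an assumption fitted to the table, not a project constant):

```shell
# Illustrative only: derive a --max_gpu_memory budget as ~65% of total VRAM,
# leaving the remainder for GPTQ's Hessian matrix computation.
for vram_gib in 24 48 80; do
  budget=$(( vram_gib * 65 / 100 ))
  echo "${vram_gib} GiB VRAM -> try --max_gpu_memory \"${budget}GiB\""
done
```

This reproduces 15, 31, and 52 GiB for the three GPU sizes above, each inside the table's suggested range.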
 ### Examples
 
 #### Example 1: Quantize Qwen3-Next-80B for Hybrid Inference (W4A16)
