
Commit 46be7f7

Merge branch 'main' into ci/add-mypy-fixes
Signed-off-by: chichun-charlie-liu <[email protected]>
2 parents: b0ea939 + 50b6ce3


46 files changed: +1723 additions, -360 deletions

.github/workflows/labelpr.yaml

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+name: Label PRs
+
+on:
+  pull_request_target:
+    types: [opened, edited, synchronize, reopened]
+
+jobs:
+  label_pr:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            // https://github.com/commitizen/conventional-commit-types
+            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert', 'dependencies'];
+
+
+            const title = context.payload.pull_request.title;
+            const results = /^(\w+)(\(\w+\))?!?:/.exec(title);
+            if (results === null) return core.setFailed(`The title does not follow conventional commits spec: https://www.conventionalcommits.org/en/v1.0.0/#summary Title: ${title}`);
+
+            const pr_type = results[1];
+            core.info(`pr_type: ${pr_type}`);
+
+            if (!valid_pr_types.includes(pr_type)) return core.setFailed(`Unknown pull request type: ${pr_type}`);
+
+            const labels = context.payload.pull_request.labels;
+            const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
+            new_labels.push({name: pr_type});
+            await github.rest.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
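For reference, a minimal Python sketch of the title check this workflow performs (the helper name `classify_pr_title` is illustrative and not part of the repository; the regex is assumed to behave like the JavaScript version above):

```python
import re

# Mirrors the valid_pr_types list in the workflow above.
VALID_PR_TYPES = ["feat", "fix", "docs", "style", "refactor", "perf",
                  "test", "build", "ci", "chore", "revert", "dependencies"]

def classify_pr_title(title: str) -> str:
    """Return the conventional-commit type of a PR title, or raise ValueError."""
    # Same pattern as the workflow: type, optional (scope), optional '!', then ':'
    match = re.match(r"^(\w+)(\(\w+\))?!?:", title)
    if match is None:
        raise ValueError(f"Title does not follow conventional commits: {title}")
    pr_type = match.group(1)
    if pr_type not in VALID_PR_TYPES:
        raise ValueError(f"Unknown pull request type: {pr_type}")
    return pr_type

# Both examples resolve to the 'fix' label.
print(classify_pr_title("fix: handle empty tensors"))          # fix
print(classify_pr_title("fix(quantizer)!: new scale format"))  # fix
```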

.github/workflows/pypi.yml

Lines changed: 3 additions & 3 deletions
@@ -44,7 +44,7 @@ jobs:
           # for setuptools-scm
           fetch-depth: 0
 
-      - uses: hynek/build-and-inspect-python-package@f01e4d047aadcc0c054c95ec9900da3ec3fc7a0f # v2.10.0
+      - uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0
 
   # push to Test PyPI on
   # - a new GitHub release is published
@@ -77,7 +77,7 @@ jobs:
           path: dist
 
       - name: Upload to Test PyPI
-        uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
+        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
         with:
          repository-url: https://test.pypi.org/legacy/
 
@@ -122,4 +122,4 @@ jobs:
        run: rm ./dist/*.sigstore.json
 
      - name: Upload to PyPI
-       uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
+       uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -40,9 +40,9 @@ jobs:
     strategy:
       matrix:
         python:
-          - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
         platform:
           - "ubuntu-latest"
 
.gitignore

Lines changed: 3 additions & 2 deletions
@@ -42,6 +42,7 @@ error.log
 
 # Files generated from running examples
 fms_mo.log
-data_train/
-data_test/
+data*_train/
+data*_test/
 act_scales/
+examples/

.spellcheck-en-custom.txt

Lines changed: 6 additions & 0 deletions
@@ -1,4 +1,5 @@
 activations
+acc
 ADR
 Args
 AutoGPTQ
@@ -38,6 +39,7 @@ Inductor
 inferenced
 inferencing
 isort
+JIT
 Jupyter
 Kubernetes
 KV
@@ -66,6 +68,7 @@ NLP
 Nouterloop
 Nvidia
 Nvidia's
+openai
 orchestrator
 param
 pre
@@ -98,13 +101,16 @@ SmoothQuant
 socio
 sparsification
 SQuAD
+stderr
+Stderr
 straightforward
 tokenization
 tokenized
 Tokenized
 tokenizer
 Tokenizer
 toml
+triton
 Unquantized
 vals
 venv

README.md

Lines changed: 7 additions & 6 deletions
@@ -36,9 +36,7 @@ FMS Model Optimizer is a framework for developing reduced precision neural netwo
 ### Requirements
 
 1. **🐧 Linux system with Nvidia GPU (V100/A100/H100)**
-2. Python 3.9 to Python 3.11
-
-   📋 Python 3.12 is currently not supported due to PyTorch Dynamo constraint
+2. Python 3.10 to Python 3.12
 3. CUDA >=12
 
 *Optional packages based on optimization functionality required:*
@@ -47,9 +45,12 @@ FMS Model Optimizer is a framework for developing reduced precision neural netwo
 - [auto_gptq](https://pypi.org/project/auto-gptq/) or build from [source](https://github.com/AutoGPTQ/AutoGPTQ)
 - If you want to experiment with **INT8** deployment in [QAT](./examples/QAT_INT8/) and [PTQ](./examples/PTQ_INT8/) examples:
   - Nvidia GPU with compute capability > 8.0 (A100 family or higher)
-  - [Ninja](https://ninja-build.org/)
-  - Clone the [CUTLASS](https://github.com/NVIDIA/cutlass) repository
-  - `PyTorch 2.3.1` (as newer version will cause issue for the custom CUDA kernel used in these examples)
+  - Option 1:
+    - [Ninja](https://ninja-build.org/)
+    - Clone the [CUTLASS](https://github.com/NVIDIA/cutlass) repository
+    - `PyTorch 2.3.1` (newer versions cause issues with the custom CUDA kernel used in these examples)
+  - Option 2:
+    - Use the included Triton kernel. Note that this kernel is currently not faster than FP16.
 - **FP8** is a reduced precision format like **INT8**:
   - Nvidia A100 family or higher
   - [llm-compressor](https://github.com/vllm-project/llm-compressor)
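As a quick sanity check for the compute-capability requirement above, the GPU can be queried with PyTorch. This is a minimal sketch, not part of the repository:

```python
import torch

# INT8/FP8 paths in the examples assume compute capability >= 8.0 (A100 family or newer).
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"GPU: {torch.cuda.get_device_name(0)}, compute capability {major}.{minor}")
    if (major, minor) < (8, 0):
        print("Warning: the INT8/FP8 examples expect an A100-class GPU or newer.")
else:
    print("No CUDA device found; the INT8/FP8 examples require an Nvidia GPU.")
```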

examples/FP8_QUANT/README.md

Lines changed: 10 additions & 12 deletions
@@ -73,20 +73,18 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 
 ## Example Test Results
 - BF16 (not quantized) LLAMA3-8B model.
-``` bash
-| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
-|lambada_openai| 1|none | 5|acc ||0.7120|± |0.0287|
-| | |none | 5|perplexity||3.8683|± |0.3716|
-```
+
+| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
+|lambada_openai| 1|none | 5|acc ||0.7120|± |0.0287|
+| | |none | 5|perplexity||3.8683|± |0.3716|
 
 - FP8 quantized LLAMA3-8B model.
-``` bash
-| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
-|lambada_openai| 1|none | 5|acc ||0.7160|± |0.0286|
-| | |none | 5|perplexity||3.8915|± |0.3727|
-```
+
+| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
+|lambada_openai| 1|none | 5|acc ||0.7160|± |0.0286|
+| | |none | 5|perplexity||3.8915|± |0.3727|
 
 ## Code Walk-through
 
examples/QAT_INT8/README.md

Lines changed: 9 additions & 4 deletions
@@ -87,16 +87,16 @@ python run_qa_no_trainer_qat.py \
   --max_seq_length 384 \
   --doc_stride 128 \
   --attn_impl eager \
-  --do_lowering
+  --do_lowering <cutlass or triton>
 ```
 
-This script uses an "external kernel" instead of the `torch.matmul` kernel to perform real `INT8` matmuls. This kernel is written for Nvidia's CUDA/CUTLASS library and is compiled once just ahead of the run. The compiled artifacts are usually stored in `~/.cache/torch_extensions/`. Remove this folder if a fresh recompile of the kernel is needed.
+This script uses an "external kernel" instead of the `torch.matmul` kernel to perform real `INT8` matmuls. There are two options for the INT8 kernel: one is written with Nvidia's CUDA/CUTLASS library and the other in Triton. Both are compiled once just ahead of the run (i.e., just-in-time, JIT, compilation). The compiled artifacts are usually stored in `~/.cache/torch_extensions/`. Remove this folder if a fresh recompile of the kernel is needed.
 
 Checkout [Example Test Results](#example-test-results) to compare against your results.
 
 ## Example Test Results
 
-For comparison purposes, here are some of the results we found during testing when tested with `PyTorch 2.3.1`:
+For comparison purposes, here are some of the results from an A100. CUTLASS results were obtained with `PyTorch 2.3.1`, while Triton results were obtained with `PyTorch 2.4.1`:
 
 > [!NOTE]
 > Accuracy could vary ~ +-0.2 from run to run.
@@ -106,16 +106,21 @@ For comparison purposes, here are some of the results we found during testing wh
 |fp16|128|eager |88.21 (as fine-tuned) |126.38|
 | |128|Inductor | |71.59|
 | |128|CUDAGRAPH | |71.13|
-|INT8|128|eager |88.33|329.45 <sup>1</sup>|
+|INT8 CUTLASS|128|eager |88.33|329.45 <sup>1</sup>|
 | |128|Inductor |88.42|67.87 <sup>2</sup>|
 | |128|CUDAGRAPH |-- |-- <sup>3</sup>|
+|INT8 Triton|128|eager |88.10|358.51|
+| |128|Inductor |88.13|99.91 <sup>4</sup>|
+| |128|CUDAGRAPH |88.13|100.21 <sup>4</sup>|
 
 <sup>1</sup> `INT8` matmuls are ~2x faster than `FP16` matmuls. However, `INT8` models will have additional overhead compared to `FP16` models. For example, converting FP tensors to INT before INT matmul.
 
 <sup>2</sup> Each of these additional quantization operations is relatively 'cheap', but the overhead of launching each job is not negligible. Using `torch.compile` can fuse the Ops and reduce the total number of jobs being launched.
 
 <sup>3</sup> `CUDAGRAPH` is the most effective way to minimize job launching overheads and can achieve ~2X end-to-end speed-up in this case. However, there seem to be bugs associated with this option at the moment. Further investigation is still on-going.
 
+<sup>4</sup> Unlike our CUTLASS `INT8` kernel, which is ~2x faster than `FP16` matmul, our Triton `INT8` kernel is not as optimized and performs only comparably to `FP16` on mid-to-large tensor sizes.
+
 ## Code Walk-through
 
 In this section, we will deep dive into what happens during the example steps.
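The footnotes in the README diff above refer to three execution modes (eager, Inductor, CUDAGRAPH). As a rough sketch of what those modes look like in user code, using a stand-in `nn.Linear` rather than the actual SQuAD model from the example, and assuming `mode="reduce-overhead"` (PyTorch's CUDA-graph-enabled compile mode) corresponds to the CUDAGRAPH rows:

```python
import torch

# Stand-in module; the benchmark above uses the quantized BERT-style SQuAD model instead.
model = torch.nn.Linear(1024, 1024).half().cuda()
x = torch.randn(128, 1024, dtype=torch.float16, device="cuda")

y_eager = model(x)                                              # eager: one kernel launch per op
model_inductor = torch.compile(model)                           # Inductor: fuses ops, fewer launches
model_cudagraph = torch.compile(model, mode="reduce-overhead")  # additionally captures CUDA graphs

print(torch.allclose(y_eager, model_inductor(x), atol=1e-2))
print(torch.allclose(y_eager, model_cudagraph(x), atol=1e-2))
```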

examples/QAT_INT8/run_qa_no_trainer_qat.py

Lines changed: 13 additions & 6 deletions
@@ -388,8 +388,10 @@ def parse_args():
     )
     parser.add_argument(
         "--do_lowering",
-        action="store_true",
-        help="convert QAT model to utilize real INT8 GPU kernel",
+        choices=["cutlass", "triton"],
+        type=str,
+        default=None,
+        help="convert QAT model to utilize real INT8 GPU kernel, 'cutlass' or 'triton'",
     )
 
     args = parser.parse_args()
@@ -1086,7 +1088,7 @@ def squad_eval(model, keep_model_in_eval_mode=True):
         qmodel_prep(model, exam_inp, qcfg, optimizer, use_dynamo=True)
 
     # ---- [fms_mo] the following code are performing speed tests ----
-    elif args.do_lowering:
+    elif args.do_lowering in ["cutlass", "triton"]:
         # Standard
         from copy import deepcopy
         import time
@@ -1134,7 +1136,7 @@ def speedtest(model, exam_inp, Ntest=100):
         logger.info(
             f"\n {label} {'with' if comp_mode else 'without'} torch.compile"
         )
-        model_copy = deepcopy(model)
+        model_copy = deepcopy(model).half()
 
         if label == "int8":
             qcfg = qconfig_init(recipe="qat_int8", args=args)
@@ -1158,7 +1160,11 @@ def speedtest(model, exam_inp, Ntest=100):
                 parent_mod = model_copy.get_submodule(parent_name)
                 qmod = getattr(parent_mod, module_name)
                 setattr(
-                    parent_mod, module_name, QLinearINT8Deploy.from_fms_mo(qmod)
+                    parent_mod,
+                    module_name,
+                    QLinearINT8Deploy.from_fms_mo(
+                        qmod, use_int_kernel=args.do_lowering
+                    ),
                 )
 
         if comp_mode is not False:
@@ -1172,7 +1178,7 @@ def speedtest(model, exam_inp, Ntest=100):
 
         # Median runtime using fixed input (in msec)
         med_runtime = speedtest(model_copy, exam_inp)
-        metrics = squad_eval(model_copy) if label == "int8" else {"f1": None}
+        metrics = squad_eval(model_copy)  # if label == "int8" else {"f1": None}
 
         summary["precision"].append(label)
         summary["compile mode"].append(comp_mode)
@@ -1385,6 +1391,7 @@ def speedtest(model, exam_inp, Ntest=100):
     )
     logger.info(f"Predict metrics: {predict_metric}")
 
+    log = {}
     if args.with_tracking:
         log = {
             "squad_v2" if args.version_2_with_negative else "squad": eval_metric,

fms_mo/aiu_addons/__init__.py

Whitespace-only changes.
