Skip to content

Commit 29c9c3b

Browse files
committed
Set batch-size to mitigate OOM issues caused by input sequences larger than those tested in the perf benchmark
1 parent cd3a018 commit 29c9c3b

File tree

3 files changed

+57
-27
lines changed

3 files changed

+57
-27
lines changed

.github/workflows/call-perf-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ jobs:
197197
python benchmark/benchmark.py -p ${{ matrix.build.project}} -m ${{ matrix.build.name }} -bs ${{ matrix.build.bs }} -df ${{ matrix.build.df }} -lp ${{ matrix.build.lp }} ${{ matrix.build.input_sequence_length && format('-isl {0}', matrix.build.input_sequence_length) }} -ts ${{ matrix.build.ts }} -o ${{ steps.strings.outputs.perf_report_json_file }} ${{ inputs.run_id_source && format('-r {0}', inputs.run_id_source) }}
198198
else
199199
# Run with pytest
200-
pytest -svv "${{ matrix.build.pytest }}" ${{ matrix.build.accuracy-testing && '--accuracy-testing true' || '' }} --output-file=${{ steps.strings.outputs.perf_report_json_file }}
200+
pytest -svv "${{ matrix.build.pytest }}" ${{ matrix.build.accuracy-testing && '--accuracy-testing true' || '' }} ${{ matrix.build['batch-size'] && format('--batch-size {0}', matrix.build['batch-size']) || '' }} --output-file=${{ steps.strings.outputs.perf_report_json_file }}
201201
fi
202202
203203
- name: Dump stablehlo to report

.github/workflows/perf-bench-matrix.json

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -322,13 +322,15 @@
322322
"name": "llama_3_1_8b_instruct_accuracy",
323323
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
324324
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b",
325-
"accuracy-testing": true
325+
"accuracy-testing": true,
326+
"batch-size": 16
326327
},
327328
{
328329
"name": "mistral_7b_accuracy",
329330
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1 protobuf sentencepiece",
330331
"pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b",
331-
"accuracy-testing": true
332+
"accuracy-testing": true,
333+
"batch-size": 8
332334
},
333335
{
334336
"name": "qwen_2_5_7b_instruct_accuracy",
@@ -382,7 +384,8 @@
382384
"name": "tiiuae_falcon3-7b-base_accuracy",
383385
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
384386
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b",
385-
"accuracy-testing": true
387+
"accuracy-testing": true,
388+
"batch-size": 4
386389
},
387390
{
388391
"name": "qwen_2_5_0_5b_instruct_accuracy",
@@ -400,7 +403,8 @@
400403
"name": "qwen_2_5_3b_instruct_accuracy",
401404
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
402405
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b",
403-
"accuracy-testing": true
406+
"accuracy-testing": true,
407+
"batch-size": 16
404408
},
405409
{
406410
"name": "qwen_3_0_6b_accuracy",
@@ -430,7 +434,8 @@
430434
"name": "ministral_8b_accuracy",
431435
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
432436
"pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b",
433-
"accuracy-testing": true
437+
"accuracy-testing": true,
438+
"batch-size": 16
434439
}
435440
]
436441
}

benchmark/tt-xla/test_llms.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ def test_llm(
7979
required_pcc: Required PCC threshold
8080
accuracy_testing: Enable token accuracy testing with reference data
8181
"""
82+
# Set default batch size if None
83+
if batch_size is None:
84+
batch_size = DEFAULT_BATCH_SIZE
85+
8286
model_loader = create_model_loader(ModelLoaderModule, num_layers=num_layers, variant=variant)
8387
if num_layers is not None and model_loader is None:
8488
pytest.fail("num_layers override requested but ModelLoader does not support it.")
@@ -196,7 +200,7 @@ def test_llm_tp(ModelLoaderModule, variant, output_file, num_layers=None, reques
196200
)
197201

198202

199-
def test_llama_3_2_1b(output_file, num_layers, request, accuracy_testing):
203+
def test_llama_3_2_1b(output_file, num_layers, request, accuracy_testing, batch_size):
200204
from third_party.tt_forge_models.llama.causal_lm.pytorch.loader import ModelLoader, ModelVariant
201205

202206
variant = ModelVariant.LLAMA_3_2_1B_INSTRUCT
@@ -207,10 +211,11 @@ def test_llama_3_2_1b(output_file, num_layers, request, accuracy_testing):
207211
num_layers=num_layers,
208212
request=request,
209213
accuracy_testing=accuracy_testing,
214+
batch_size=batch_size,
210215
)
211216

212217

213-
def test_llama_3_2_3b(output_file, num_layers, request, accuracy_testing):
218+
def test_llama_3_2_3b(output_file, num_layers, request, accuracy_testing, batch_size):
214219
from third_party.tt_forge_models.llama.causal_lm.pytorch.loader import ModelLoader, ModelVariant
215220

216221
variant = ModelVariant.LLAMA_3_2_3B_INSTRUCT
@@ -221,10 +226,11 @@ def test_llama_3_2_3b(output_file, num_layers, request, accuracy_testing):
221226
num_layers=num_layers,
222227
request=request,
223228
accuracy_testing=accuracy_testing,
229+
batch_size=batch_size,
224230
)
225231

226232

227-
def test_gemma_1_1_2b(output_file, num_layers, request, accuracy_testing):
233+
def test_gemma_1_1_2b(output_file, num_layers, request, accuracy_testing, batch_size):
228234
from third_party.tt_forge_models.gemma.pytorch.loader import ModelLoader, ModelVariant
229235

230236
variant = ModelVariant.GEMMA_1_1_2B_IT
@@ -237,10 +243,11 @@ def test_gemma_1_1_2b(output_file, num_layers, request, accuracy_testing):
237243
num_layers=num_layers,
238244
request=request,
239245
accuracy_testing=accuracy_testing,
246+
batch_size=batch_size,
240247
)
241248

242249

243-
def test_gemma_2_2b(output_file, num_layers, request, accuracy_testing):
250+
def test_gemma_2_2b(output_file, num_layers, request, accuracy_testing, batch_size):
244251
from third_party.tt_forge_models.gemma.pytorch.loader import ModelLoader, ModelVariant
245252

246253
variant = ModelVariant.GEMMA_2_2B_IT
@@ -253,10 +260,11 @@ def test_gemma_2_2b(output_file, num_layers, request, accuracy_testing):
253260
num_layers=num_layers,
254261
request=request,
255262
accuracy_testing=accuracy_testing,
263+
batch_size=batch_size,
256264
)
257265

258266

259-
def test_phi1(output_file, num_layers, request, accuracy_testing):
267+
def test_phi1(output_file, num_layers, request, accuracy_testing, batch_size):
260268
from third_party.tt_forge_models.phi1.causal_lm.pytorch.loader import ModelLoader, ModelVariant
261269

262270
variant = ModelVariant.PHI1
@@ -267,10 +275,11 @@ def test_phi1(output_file, num_layers, request, accuracy_testing):
267275
num_layers=num_layers,
268276
request=request,
269277
accuracy_testing=accuracy_testing,
278+
batch_size=batch_size,
270279
)
271280

272281

273-
def test_phi1_5(output_file, num_layers, request, accuracy_testing):
282+
def test_phi1_5(output_file, num_layers, request, accuracy_testing, batch_size):
274283
from third_party.tt_forge_models.phi1_5.causal_lm.pytorch.loader import ModelLoader, ModelVariant
275284

276285
variant = ModelVariant.PHI1_5
@@ -281,10 +290,11 @@ def test_phi1_5(output_file, num_layers, request, accuracy_testing):
281290
num_layers=num_layers,
282291
request=request,
283292
accuracy_testing=accuracy_testing,
293+
batch_size=batch_size,
284294
)
285295

286296

287-
def test_phi2(output_file, num_layers, request, accuracy_testing):
297+
def test_phi2(output_file, num_layers, request, accuracy_testing, batch_size):
288298
from third_party.tt_forge_models.phi2.causal_lm.pytorch.loader import ModelLoader, ModelVariant
289299

290300
variant = ModelVariant.PHI2
@@ -295,10 +305,11 @@ def test_phi2(output_file, num_layers, request, accuracy_testing):
295305
num_layers=num_layers,
296306
request=request,
297307
accuracy_testing=accuracy_testing,
308+
batch_size=batch_size,
298309
)
299310

300311

301-
def test_falcon3_1b(output_file, num_layers, request, accuracy_testing):
312+
def test_falcon3_1b(output_file, num_layers, request, accuracy_testing, batch_size):
302313
from third_party.tt_forge_models.falcon.pytorch.loader import ModelLoader, ModelVariant
303314

304315
variant = ModelVariant.FALCON_1B
@@ -312,10 +323,11 @@ def test_falcon3_1b(output_file, num_layers, request, accuracy_testing):
312323
num_layers=num_layers,
313324
request=request,
314325
accuracy_testing=accuracy_testing,
326+
batch_size=batch_size,
315327
)
316328

317329

318-
def test_falcon3_3b(output_file, num_layers, request, accuracy_testing):
330+
def test_falcon3_3b(output_file, num_layers, request, accuracy_testing, batch_size):
319331
from third_party.tt_forge_models.falcon.pytorch.loader import ModelLoader, ModelVariant
320332

321333
variant = ModelVariant.FALCON_3B
@@ -329,10 +341,11 @@ def test_falcon3_3b(output_file, num_layers, request, accuracy_testing):
329341
num_layers=num_layers,
330342
request=request,
331343
accuracy_testing=accuracy_testing,
344+
batch_size=batch_size,
332345
)
333346

334347

335-
def test_qwen_2_5_0_5b(output_file, num_layers, request, accuracy_testing):
348+
def test_qwen_2_5_0_5b(output_file, num_layers, request, accuracy_testing, batch_size):
336349
from third_party.tt_forge_models.qwen_2_5.causal_lm.pytorch.loader import ModelLoader, ModelVariant
337350

338351
variant = ModelVariant.QWEN_2_5_0_5B_INSTRUCT
@@ -344,10 +357,11 @@ def test_qwen_2_5_0_5b(output_file, num_layers, request, accuracy_testing):
344357
num_layers=num_layers,
345358
request=request,
346359
accuracy_testing=accuracy_testing,
360+
batch_size=batch_size,
347361
)
348362

349363

350-
def test_qwen_3_0_6b(output_file, num_layers, request, accuracy_testing):
364+
def test_qwen_3_0_6b(output_file, num_layers, request, accuracy_testing, batch_size):
351365
from third_party.tt_forge_models.qwen_3.causal_lm.pytorch.loader import ModelLoader, ModelVariant
352366

353367
variant = ModelVariant.QWEN_3_0_6B
@@ -358,10 +372,11 @@ def test_qwen_3_0_6b(output_file, num_layers, request, accuracy_testing):
358372
num_layers=num_layers,
359373
request=request,
360374
accuracy_testing=accuracy_testing,
375+
batch_size=batch_size,
361376
)
362377

363378

364-
def test_qwen_3_1_7b(output_file, num_layers, request, accuracy_testing):
379+
def test_qwen_3_1_7b(output_file, num_layers, request, accuracy_testing, batch_size):
365380
from third_party.tt_forge_models.qwen_3.causal_lm.pytorch.loader import ModelLoader, ModelVariant
366381

367382
variant = ModelVariant.QWEN_3_1_7B
@@ -372,10 +387,11 @@ def test_qwen_3_1_7b(output_file, num_layers, request, accuracy_testing):
372387
num_layers=num_layers,
373388
request=request,
374389
accuracy_testing=accuracy_testing,
390+
batch_size=batch_size,
375391
)
376392

377393

378-
def test_qwen_3_4b(output_file, num_layers, request, accuracy_testing):
394+
def test_qwen_3_4b(output_file, num_layers, request, accuracy_testing, batch_size):
379395
from third_party.tt_forge_models.qwen_3.causal_lm.pytorch.loader import ModelLoader, ModelVariant
380396

381397
variant = ModelVariant.QWEN_3_4B
@@ -386,10 +402,11 @@ def test_qwen_3_4b(output_file, num_layers, request, accuracy_testing):
386402
num_layers=num_layers,
387403
request=request,
388404
accuracy_testing=accuracy_testing,
405+
batch_size=batch_size,
389406
)
390407

391408

392-
def test_qwen_2_5_1_5b(output_file, num_layers, request, accuracy_testing):
409+
def test_qwen_2_5_1_5b(output_file, num_layers, request, accuracy_testing, batch_size):
393410
from third_party.tt_forge_models.qwen_2_5.causal_lm.pytorch.loader import ModelLoader, ModelVariant
394411

395412
variant = ModelVariant.QWEN_2_5_1_5B_INSTRUCT
@@ -400,10 +417,11 @@ def test_qwen_2_5_1_5b(output_file, num_layers, request, accuracy_testing):
400417
num_layers=num_layers,
401418
request=request,
402419
accuracy_testing=accuracy_testing,
420+
batch_size=batch_size,
403421
)
404422

405423

406-
def test_qwen_2_5_3b(output_file, num_layers, request, accuracy_testing):
424+
def test_qwen_2_5_3b(output_file, num_layers, request, accuracy_testing, batch_size):
407425
from third_party.tt_forge_models.qwen_2_5.causal_lm.pytorch.loader import ModelLoader, ModelVariant
408426

409427
variant = ModelVariant.QWEN_2_5_3B_INSTRUCT
@@ -414,10 +432,11 @@ def test_qwen_2_5_3b(output_file, num_layers, request, accuracy_testing):
414432
num_layers=num_layers,
415433
request=request,
416434
accuracy_testing=accuracy_testing,
435+
batch_size=batch_size,
417436
)
418437

419438

420-
def test_qwen_3_8b(output_file, num_layers, request, accuracy_testing):
439+
def test_qwen_3_8b(output_file, num_layers, request, accuracy_testing, batch_size):
421440
from third_party.tt_forge_models.qwen_3.causal_lm.pytorch.loader import ModelLoader, ModelVariant
422441

423442
variant = ModelVariant.QWEN_3_8B
@@ -428,10 +447,11 @@ def test_qwen_3_8b(output_file, num_layers, request, accuracy_testing):
428447
num_layers=num_layers,
429448
request=request,
430449
accuracy_testing=accuracy_testing,
450+
batch_size=batch_size,
431451
)
432452

433453

434-
def test_qwen_2_5_7b(output_file, num_layers, request, accuracy_testing):
454+
def test_qwen_2_5_7b(output_file, num_layers, request, accuracy_testing, batch_size):
435455
from third_party.tt_forge_models.qwen_2_5.causal_lm.pytorch.loader import ModelLoader, ModelVariant
436456

437457
variant = ModelVariant.QWEN_2_5_7B_INSTRUCT
@@ -442,6 +462,7 @@ def test_qwen_2_5_7b(output_file, num_layers, request, accuracy_testing):
442462
num_layers=num_layers,
443463
request=request,
444464
accuracy_testing=accuracy_testing,
465+
batch_size=batch_size,
445466
)
446467

447468

@@ -485,7 +506,7 @@ def test_mamba_2_8b(output_file, num_layers, request):
485506
)
486507

487508

488-
def test_falcon3_7b(output_file, num_layers, request, accuracy_testing):
509+
def test_falcon3_7b(output_file, num_layers, request, accuracy_testing, batch_size):
489510
from third_party.tt_forge_models.falcon.pytorch.loader import ModelLoader, ModelVariant
490511

491512
variant = ModelVariant.FALCON_7B
@@ -499,10 +520,11 @@ def test_falcon3_7b(output_file, num_layers, request, accuracy_testing):
499520
num_layers=num_layers,
500521
request=request,
501522
accuracy_testing=accuracy_testing,
523+
batch_size=batch_size,
502524
)
503525

504526

505-
def test_mistral_7b(output_file, num_layers, request, accuracy_testing):
527+
def test_mistral_7b(output_file, num_layers, request, accuracy_testing, batch_size):
506528
from third_party.tt_forge_models.mistral.pytorch.loader import ModelLoader, ModelVariant
507529

508530
variant = ModelVariant.MISTRAL_7B_INSTRUCT_V03
@@ -513,10 +535,11 @@ def test_mistral_7b(output_file, num_layers, request, accuracy_testing):
513535
num_layers=num_layers,
514536
request=request,
515537
accuracy_testing=accuracy_testing,
538+
batch_size=batch_size,
516539
)
517540

518541

519-
def test_ministral_8b(output_file, num_layers, request, accuracy_testing):
542+
def test_ministral_8b(output_file, num_layers, request, accuracy_testing, batch_size):
520543
from third_party.tt_forge_models.mistral.pytorch.loader import ModelLoader, ModelVariant
521544

522545
variant = ModelVariant.MINISTRAL_8B
@@ -528,10 +551,11 @@ def test_ministral_8b(output_file, num_layers, request, accuracy_testing):
528551
request=request,
529552
fp32_dest_acc_en=False,
530553
accuracy_testing=accuracy_testing,
554+
batch_size=batch_size,
531555
)
532556

533557

534-
def test_llama_3_1_8b(output_file, num_layers, request, accuracy_testing):
558+
def test_llama_3_1_8b(output_file, num_layers, request, accuracy_testing, batch_size):
535559
from third_party.tt_forge_models.llama.causal_lm.pytorch.loader import ModelLoader, ModelVariant
536560

537561
variant = ModelVariant.LLAMA_3_1_8B_INSTRUCT
@@ -543,6 +567,7 @@ def test_llama_3_1_8b(output_file, num_layers, request, accuracy_testing):
543567
request=request,
544568
fp32_dest_acc_en=False,
545569
accuracy_testing=accuracy_testing,
570+
batch_size=batch_size,
546571
)
547572

548573

0 commit comments

Comments
 (0)