Add mistral/gpt-oss to benchmarks (pytorch#163565)

angelayi · pytorchmergebot · commit dad54ca7c054 · 2025-09-24T06:12:36.000Z
Potential issues:
* gpt-oss-20b is probably too big (I can't run it on my devserver)
* Mistral requires HF authentication
* Mistral also takes a while to run the performance checks (need to wait for CI)

Pull Request resolved: pytorch#163565
Approved by: https://github.com/huydhn
diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py
@@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                 "google/gemma-3-4b-it",
                 "openai/whisper-tiny",
                 "Qwen/Qwen3-0.6B",
+                "mistralai/Mistral-7B-Instruct-v0.3",
+                "openai/gpt-oss-20b",
             }
         )
 
diff --git a/benchmarks/dynamo/check_graph_breaks.py b/benchmarks/dynamo/check_graph_breaks.py
@@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                 "google/gemma-3-4b-it",
                 "openai/whisper-tiny",
                 "Qwen/Qwen3-0.6B",
+                "mistralai/Mistral-7B-Instruct-v0.3",
+                "openai/gpt-oss-20b",
             }
         )
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@@ -187,3 +187,11 @@ openai/whisper-tiny,fail_to_run,0
 
 
 Qwen/Qwen3-0.6B,fail_to_run,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0
+
+
+
+openai/gpt-oss-20b,fail_to_run,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 
 
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 
 
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 
 
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 
 
 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
diff --git a/benchmarks/dynamo/huggingface.yaml b/benchmarks/dynamo/huggingface.yaml
@@ -11,6 +11,8 @@ skip:
     - GPTJForQuestionAnswering
     # Model too big
     - google/gemma-3-4b-it
+    - openai/gpt-oss-20b
+    - mistralai/Mistral-7B-Instruct-v0.3
 
   device:
     cpu:
@@ -19,6 +21,8 @@ skip:
       - google/gemma-3-4b-it
       - openai/whisper-tiny
       - Qwen/Qwen3-0.6B
+      - mistralai/Mistral-7B-Instruct-v0.3
+      - openai/gpt-oss-20b
 
   control_flow:
     - AllenaiLongformerBase
@@ -79,6 +83,8 @@ batch_size:
     google/gemma-3-4b-it: 8
     openai/whisper-tiny: 8
     Qwen/Qwen3-0.6B: 8
+    mistralai/Mistral-7B-Instruct-v0.3: 8
+    openai/gpt-oss-20b: 8
 
 
 tolerance:
diff --git a/benchmarks/dynamo/huggingface_llm_models.py b/benchmarks/dynamo/huggingface_llm_models.py
@@ -99,4 +99,6 @@ def get_model_and_inputs(model_name, device):
     "google/gemma-3-4b-it": TextGenerationBenchmark,
     "openai/whisper-tiny": WhisperBenchmark,
     "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
+    "mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
+    "openai/gpt-oss-20b": TextGenerationBenchmark,
 }
diff --git a/benchmarks/dynamo/huggingface_models_list.txt b/benchmarks/dynamo/huggingface_models_list.txt
@@ -51,3 +51,5 @@ google/gemma-2-2b,8
 google/gemma-3-4b-it,8
 openai/whisper-tiny,8
 Qwen/Qwen3-0.6B,8
+mistralai/Mistral-7B-Instruct-v0.3, 8
+openai/gpt-oss-20b, 8

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):`
`78`	`78`	`"google/gemma-3-4b-it",`
`79`	`79`	`"openai/whisper-tiny",`
`80`	`80`	`"Qwen/Qwen3-0.6B",`
	`81`	`+ "mistralai/Mistral-7B-Instruct-v0.3",`
	`82`	`+ "openai/gpt-oss-20b",`
`81`	`83`	`}`
`82`	`84`	`)`
`83`	`85`
Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):`
`61`	`61`	`"google/gemma-3-4b-it",`
`62`	`62`	`"openai/whisper-tiny",`
`63`	`63`	`"Qwen/Qwen3-0.6B",`
	`64`	`+ "mistralai/Mistral-7B-Instruct-v0.3",`
	`65`	`+ "openai/gpt-oss-20b",`
`64`	`66`	`}`
`65`	`67`	`)`
`66`	`68`