 def main(args):
     processor = AutoProcessor.from_pretrained(args.model_id)
     tokenizer = processor.tokenizer
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_id).to(args.device)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_id, torch_dtype=torch.bfloat16).to(args.device)
 
     # create text prompt
     chat = [
@@ -45,24 +45,24 @@ def benchmark(batch, min_new_tokens=None):
     # START TIMING
     start_time = time.time()
 
-    with torch.autocast(model.device.type, enabled=True):
-        model_inputs = processor(
-            texts,
-            audios,
-            device=args.device,  # Computation device; returned tensors are put on CPU
-            return_tensors="pt",
-        ).to(args.device)
-
-        # Model Inference
-        model_outputs = model.generate(
-            **model_inputs,
-            bos_token_id=tokenizer.bos_token_id,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            repetition_penalty=1.0,
-            **gen_kwargs,
-            min_new_tokens=min_new_tokens,
-        )
+    # with torch.autocast(model.device.type, enabled=True):
+    model_inputs = processor(
+        texts,
+        audios,
+        device=args.device,  # Computation device; returned tensors are put on CPU
+        return_tensors="pt",
+    ).to(args.device)
+
+    # Model Inference
+    model_outputs = model.generate(
+        **model_inputs,
+        bos_token_id=tokenizer.bos_token_id,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        repetition_penalty=1.0,
+        **gen_kwargs,
+        min_new_tokens=min_new_tokens,
+    )
 
     # Transformers includes the input IDs in the response.
     num_input_tokens = model_inputs["input_ids"].shape[-1]
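
For context, a minimal sketch of what this change does, with openai/whisper-tiny standing in for args.model_id (the real script's processor arguments and gen_kwargs are omitted): loading with torch_dtype=torch.bfloat16 stores the weights in bfloat16 from the start, so the runtime torch.autocast wrapper becomes unnecessary. The explicit cast of the inputs in the second half is an assumption of this sketch; whether a given checkpoint casts its inputs internally varies by model.

import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_id = "openai/whisper-tiny"  # stand-in checkpoint; the script uses args.model_id
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

# Before this PR: fp32 weights; autocast downcasts matmuls/convs at runtime.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
with torch.autocast(device, enabled=True):
    out = model.generate(**inputs.to(device))

# After this PR: weights are loaded directly in bfloat16 (roughly half the
# memory of fp32 weights), so no autocast context is needed; here the input
# features are cast explicitly to match the weight dtype.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
).to(device)
out = model.generate(**inputs.to(device=device, dtype=torch.bfloat16))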