Skip to content

Commit 8b72232

Browse files
committed
Enhance MLX batch_size functionality and documentation
- Improve MLX transcribe_audio call to include more parameters (task, word_timestamps, verbose, temperature)
- Add comprehensive batch_size documentation in README with usage examples and guidelines
- Add batch_size functionality tests including parsing validation
- Ensure batch_size parameter is properly passed through to lightning-whisper-mlx backend
- Maintain backward compatibility with existing functionality

This ensures the batch_size functionality works exactly like in the source lightning-whisper-mlx repository.
1 parent 5577b7d commit 8b72232

File tree

3 files changed

+73
-2
lines changed

3 files changed

+73
-2
lines changed

README.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ Over 700+⭐'s because this app just works! Works great for windows
1515

1616
### New in 3.1!
1717

18-
Mac acceleration option using the new [lightning-whisper-mlx](https://github.com/mustafaaljadery/lightning-whisper-mlx) backend. Enable with `--device mlx`. Now supports multiple languages, custom vocabulary via `--initial_prompt`, and both transcribe/translate tasks. 10x faster than Whisper CPP, 4x faster than previous MLX implementations!
18+
Mac acceleration option using the new [lightning-whisper-mlx](https://github.com/mustafaaljadery/lightning-whisper-mlx) backend. Enable with `--device mlx`. Now supports multiple languages, custom vocabulary via `--initial_prompt`, both transcribe/translate tasks, and **batch processing** for improved throughput. 10x faster than Whisper CPP, 4x faster than previous MLX implementations!
19+
20+
**Batch Processing:** MLX backend supports `--batch_size` parameter (default: 12) for improved throughput. Higher batch sizes provide better performance but require more memory.
1921

2022
**Model Storage:** MLX models are now stored in `~/.cache/whisper/mlx_models/` for consistency with other backends, instead of cluttering your current working directory.
2123

@@ -55,6 +57,8 @@ transcribe-anything https://www.youtube.com/watch?v=dQw4w9WgXcQ --device insane
5557
transcribe-anything https://www.youtube.com/watch?v=dQw4w9WgXcQ --device insane --task translate
5658
# Mac accelerated back-end
5759
transcribe-anything https://www.youtube.com/watch?v=dQw4w9WgXcQ --device mlx
60+
# Mac accelerated with custom batch size for better throughput
61+
transcribe-anything video.mp4 --device mlx --batch_size 24
5862
# Use custom prompt for better recognition of specific terms
5963
transcribe-anything video.mp4 --initial_prompt "The speaker discusses AI, machine learning, and neural networks."
6064
# Load prompt from file
@@ -174,6 +178,27 @@ Mac:
174178

175179
- Use `--device mlx`
176180

181+
#### MLX Batch Processing
182+
183+
The MLX backend supports batch processing for improved throughput on Apple Silicon:
184+
185+
```bash
186+
# Default batch size (12)
187+
transcribe-anything video.mp4 --device mlx
188+
189+
# Custom batch size for better performance
190+
transcribe-anything video.mp4 --device mlx --batch_size 24
191+
192+
# Lower batch size for memory-constrained systems
193+
transcribe-anything video.mp4 --device mlx --batch_size 6
194+
```
195+
196+
**Batch Size Guidelines:**
197+
- Default: 12 (good balance of speed and memory usage)
198+
- Higher values (16-24): Better throughput but require more unified memory
199+
- Lower values (4-8): Use if experiencing memory issues
200+
- The optimal batch size depends on your model size and available unified memory
201+
177202
# Custom Prompts and Vocabulary
178203

179204
Whisper supports custom prompts to improve transcription accuracy for domain-specific vocabulary, names, or technical terms. This is especially useful when transcribing content with:

src/transcribe_anything/whisper_mac.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ def run_whisper_mac_mlx( # pylint: disable=too-many-arguments
233233
word_timestamps = parsed_args.get("word_timestamps", False)
234234
verbose = parsed_args.get("verbose", False)
235235
temperature = parsed_args.get("temperature", 0.0)
236+
task_param = parsed_args.get("task", task) # Use parsed task or fallback to function parameter
236237

237238
# Get the environment and run transcription
238239
env = get_environment()
@@ -291,8 +292,12 @@ def run_whisper_mac_mlx( # pylint: disable=too-many-arguments
291292
audio="{input_wav_abs}",
292293
path_or_hf_repo=str(model_dir),
293294
language={repr(parsed_args.get("language"))},
295+
task={repr(task_param)},
294296
batch_size={batch_size},
295-
initial_prompt={repr(initial_prompt)}
297+
initial_prompt={repr(initial_prompt)},
298+
word_timestamps={repr(word_timestamps)},
299+
verbose={repr(verbose)},
300+
temperature={repr(temperature)}
296301
)
297302
298303
# Print the result as JSON

tests/test_insanely_fast_whisper_mlx.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,47 @@ def test_multilingual_support(self) -> None:
101101
self.assertTrue((test_dir / "out.srt").exists())
102102
self.assertTrue((test_dir / "out.json").exists())
103103

104+
@unittest.skipUnless(CAN_RUN_TEST, "Not mac")
105+
def test_batch_size_functionality(self) -> None:
106+
"""Check that batch_size parameter works correctly."""
107+
test_dir = LOCALFILE_DIR / "text_video_batch_size"
108+
shutil.rmtree(test_dir, ignore_errors=True)
109+
110+
# Test with custom batch_size
111+
run_whisper_mac_mlx(
112+
input_wav=TEST_WAV,
113+
model="small",
114+
output_dir=test_dir,
115+
language="en",
116+
task="transcribe",
117+
other_args=["--batch_size", "6"] # Custom batch size
118+
)
119+
120+
# Verify output files were created
121+
self.assertTrue((test_dir / "out.txt").exists())
122+
self.assertTrue((test_dir / "out.srt").exists())
123+
self.assertTrue((test_dir / "out.json").exists())
124+
self.assertTrue((test_dir / "out.vtt").exists())
125+
126+
@unittest.skipUnless(CAN_RUN_TEST, "Not mac")
127+
def test_batch_size_parsing(self) -> None:
128+
"""Check that batch_size argument parsing works correctly."""
129+
from transcribe_anything.whisper_mac import _parse_other_args
130+
131+
# Test valid batch_size
132+
result = _parse_other_args(["--batch_size", "24"])
133+
self.assertEqual(result["batch_size"], 24)
134+
135+
# Test with other arguments
136+
result = _parse_other_args(["--language", "en", "--batch_size", "8", "--verbose"])
137+
self.assertEqual(result["batch_size"], 8)
138+
self.assertEqual(result["language"], "en")
139+
self.assertTrue(result["verbose"])
140+
141+
# Test invalid batch_size (should not crash, just use default)
142+
result = _parse_other_args(["--batch_size", "invalid"])
143+
self.assertNotIn("batch_size", result) # Should be filtered out due to ValueError
144+
104145

105146
if __name__ == "__main__":
106147
unittest.main()

0 commit comments

Comments (0)