Merge pull request #1982 from rhatdan/max

rhatdan · web-flow · commit de6c684d7a48 · 2025-10-06T11:15:49.000-04:00
Add unified --max-tokens CLI argument for output token limiting
diff --git a/docs/ramalama-perplexity.1.md b/docs/ramalama-perplexity.1.md
@@ -84,6 +84,13 @@ Accelerated images:
 pass --group-add keep-groups to podman (default: False)
 If GPU device on host system is accessible to user via group access, this option leaks the groups into the container.
 
+#### **--max-tokens**=*integer*
+Maximum number of tokens to generate. Set to 0 for unlimited output (default: 0).
+This parameter is mapped to the appropriate runtime-specific parameter:
+- llama.cpp: `-n` parameter
+- MLX: `--max-tokens` parameter
+- vLLM: `--max-tokens` parameter
+
 #### **--name**, **-n**
 name of the container to run the Model in
 
diff --git a/docs/ramalama-run.1.md b/docs/ramalama-run.1.md
@@ -98,6 +98,13 @@ If GPU device on host system is accessible to user via group access, this option
 #### **--keepalive**
 duration to keep a model loaded (e.g. 5m)
 
+#### **--max-tokens**=*integer*
+Maximum number of tokens to generate. Set to 0 for unlimited output (default: 0).
+This parameter is mapped to the appropriate runtime-specific parameter:
+- llama.cpp: `-n` parameter
+- MLX: `--max-tokens` parameter
+- vLLM: `--max-tokens` parameter
+
 #### **--mcp**=SERVER_URL
 MCP (Model Context Protocol) servers to use for enhanced tool calling capabilities.
 Can be specified multiple times to connect to multiple MCP servers.
diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
@@ -142,6 +142,13 @@ Accelerated images:
 pass --group-add keep-groups to podman (default: False)
 If GPU device on host system is accessible to user via group access, this option leaks the groups into the container.
 
+#### **--max-tokens**=*integer*
+Maximum number of tokens to generate. Set to 0 for unlimited output (default: 0).
+This parameter is mapped to the appropriate runtime-specific parameter:
+- llama.cpp: `-n` parameter
+- MLX: `--max-tokens` parameter
+- vLLM: `--max-tokens` parameter
+
 #### **--model-draft**
 
 A draft model is a smaller, faster model that helps accelerate the decoding
diff --git a/docs/ramalama.conf.5.md b/docs/ramalama.conf.5.md
@@ -124,6 +124,11 @@ specified vllm model runtime.
 Pass `--group-add keep-groups` to podman, when using podman.
 In some cases this is needed to access the gpu from a rootless container
 
+**max_tokens**=0
+
+Maximum number of tokens to generate. Set to 0 for unlimited output (default: 0).
+This parameter is mapped to the appropriate runtime-specific parameter when executing models.
+
 **ngl**=-1
 
 number of gpu layers, 0 means CPU inferencing, 999 means use max layers (default: -1)
diff --git a/ramalama/cli.py b/ramalama/cli.py
@@ -879,6 +879,15 @@ def runtime_options(parser, command):
         help="name of container in which the Model will be run",
         completer=suppressCompleter,
     )
+    if command in ["run", "perplexity", "serve"]:
+        parser.add_argument(
+            "--max-tokens",
+            dest="max_tokens",
+            type=int,
+            default=CONFIG.max_tokens,
+            help="maximum number of tokens to generate (0 = unlimited)",
+            completer=suppressCompleter,
+        )
     add_network_argument(parser, dflt=None)
     parser.add_argument(
         "--ngl",
diff --git a/ramalama/command/context.py b/ramalama/command/context.py
@@ -10,40 +10,42 @@
 class RamalamaArgsContext:
 
     def __init__(self):
-        self.host: Optional[str] = None
-        self.port: Optional[int] = None
-        self.thinking: Optional[bool] = None
-        self.ctx_size: Optional[int] = None
         self.cache_reuse: Optional[int] = None
-        self.temp: Optional[float] = None
+        self.container: Optional[bool] = None
+        self.ctx_size: Optional[int] = None
         self.debug: Optional[bool] = None
-        self.webui: Optional[bool] = None
-        self.ngl: Optional[int] = None
-        self.threads: Optional[int] = None
+        self.host: Optional[str] = None
         self.logfile: Optional[str] = None
-        self.container: Optional[bool] = None
+        self.max_tokens: Optional[int] = None
         self.model_draft: Optional[str] = None
-        self.seed: Optional[int] = None
+        self.ngl: Optional[int] = None
+        self.port: Optional[int] = None
         self.runtime_args: Optional[str] = None
+        self.seed: Optional[int] = None
+        self.temp: Optional[float] = None
+        self.thinking: Optional[bool] = None
+        self.threads: Optional[int] = None
+        self.webui: Optional[bool] = None
 
     @staticmethod
     def from_argparse(args: argparse.Namespace) -> "RamalamaArgsContext":
         ctx = RamalamaArgsContext()
-        ctx.host = getattr(args, "host", None)
-        ctx.port = getattr(args, "port", None)
-        ctx.thinking = getattr(args, "thinking", None)
+        ctx.cache_reuse = getattr(args, "cache_reuse", None)
+        ctx.container = getattr(args, "container", None)
         ctx.ctx_size = getattr(args, "context", None)
-        ctx.temp = getattr(args, "temp", None)
         ctx.debug = getattr(args, "debug", None)
-        ctx.webui = getattr(args, "webui", None)
-        ctx.ngl = getattr(args, "ngl", None)
-        ctx.threads = getattr(args, "threads", None)
+        ctx.host = getattr(args, "host", None)
         ctx.logfile = getattr(args, "logfile", None)
-        ctx.container = getattr(args, "container", None)
+        ctx.max_tokens = getattr(args, "max_tokens", None)
         ctx.model_draft = getattr(args, "model_draft", None)
-        ctx.seed = getattr(args, "seed", None)
+        ctx.ngl = getattr(args, "ngl", None)
+        ctx.port = getattr(args, "port", None)
         ctx.runtime_args = getattr(args, "runtime_args", None)
-        ctx.cache_reuse = getattr(args, "cache_reuse", None)
+        ctx.seed = getattr(args, "seed", None)
+        ctx.temp = getattr(args, "temp", None)
+        ctx.thinking = getattr(args, "thinking", None)
+        ctx.threads = getattr(args, "threads", None)
+        ctx.webui = getattr(args, "webui", None)
         return ctx
 
 
diff --git a/ramalama/config.py b/ramalama/config.py
@@ -116,10 +116,10 @@ class RamalamaSettings:
 class BaseConfig:
     api: str = "none"
     api_key: str = None
+    cache_reuse: int = 256
     carimage: str = "registry.access.redhat.com/ubi10-micro:latest"
     container: bool = None  # type: ignore
     ctx_size: int = 0
-    cache_reuse: int = 256
     default_image: str = DEFAULT_IMAGE
     dryrun: bool = False
     engine: SUPPORTED_ENGINES | None = field(default_factory=get_default_engine)
@@ -139,6 +139,7 @@ class BaseConfig:
         }
     )
     keep_groups: bool = False
+    max_tokens: int = 0
     ngl: int = -1
     ocr: bool = False
     port: str = str(DEFAULT_PORT)
diff --git a/ramalama/daemon/service/command_factory.py b/ramalama/daemon/service/command_factory.py
@@ -38,6 +38,9 @@ def _set_defaults(self):
         if "temp" not in self.request_args:
             self.request_args["temp"] = CONFIG.temp
 
+        if "max_tokens" not in self.request_args:
+            self.request_args["max_tokens"] = CONFIG.max_tokens
+
         if "ngl" not in self.request_args:
             self.request_args["ngl"] = CONFIG.ngl
 
@@ -104,7 +107,7 @@ def _build_llama_serve_command(self) -> list[str]:
         if self.request_args.get("webui") == "off":
             cmd.extend(["--no-webui"])
 
-        if check_nvidia() or check_metal(SimpleNamespace({"container": False})):
+        if check_nvidia() or check_metal(SimpleNamespace(container=False)):
             cmd.extend(["--flash-attn", "on"])
 
         # gpu arguments
@@ -115,4 +118,9 @@ def _build_llama_serve_command(self) -> list[str]:
         threads = self.request_args.get("threads")
         cmd.extend(["--threads", str(threads)])
 
+        # Pass -n to llama.cpp only for a positive limit; 0 means unlimited, so the flag is omitted
+        max_tokens = self.request_args.get("max_tokens", 0)
+        if max_tokens > 0:
+            cmd.extend(["-n", str(max_tokens)])
+
         return cmd
diff --git a/test/e2e/test_cli_max_tokens.py b/test/e2e/test_cli_max_tokens.py
@@ -0,0 +1,130 @@
+import sys
+from contextlib import redirect_stderr, redirect_stdout
+from subprocess import CalledProcessError
+
+import pytest
+
+
+def run_ramalama_direct(args):
+    """Run ramalama directly via Python import to avoid installation issues"""
+    from ramalama.cli import main
+
+    # Save original sys.argv
+    original_argv = sys.argv[:]
+
+    try:
+        sys.argv = ["ramalama"] + args
+        # Capture stdout and stderr by redirecting both into in-memory buffers
+        import io
+
+        stdout_capture = io.StringIO()
+        stderr_capture = io.StringIO()
+
+        with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
+            try:
+                main()
+            except SystemExit as e:
+                # argparse calls sys.exit(); collect whatever output was captured so far
+                stdout_content = stdout_capture.getvalue()
+                stderr_content = stderr_capture.getvalue()
+
+                if e.code != 0:  # argparse help exits with 0
+                    # If there was an error, raise CalledProcessError
+                    raise CalledProcessError(e.code, args, stdout_content + stderr_content)
+
+                return stdout_content
+
+        # If no exception, return the captured output
+        return stdout_capture.getvalue()
+
+    finally:
+        # Always restore original sys.argv
+        sys.argv = original_argv
+
+
+@pytest.mark.e2e
+def test_max_tokens_cli_argument_help():
+    """Test that --max-tokens argument appears in help for supported commands"""
+
+    # Test commands that should have --max-tokens
+    supported_commands = ["run", "serve", "perplexity"]
+
+    for command in supported_commands:
+        result = run_ramalama_direct([command, "--help"])
+        assert "--max-tokens" in result, f"--max-tokens should appear in {command} help"
+        assert "maximum number of tokens to generate" in result, f"Help text should be present in {command}"
+
+
+@pytest.mark.e2e
+def test_max_tokens_argument_parsing():
+    """Test that --max-tokens argument is properly parsed"""
+
+    # Test that --max-tokens doesn't cause argument parsing errors
+    # by checking help with the argument present
+    try:
+        result = run_ramalama_direct(["run", "--max-tokens", "512", "--help"])
+        # If we get here, the argument was parsed successfully
+        assert "--max-tokens" in result
+    except CalledProcessError as e:
+        # Should not fail with "unrecognized arguments" for --max-tokens
+        assert "unrecognized arguments: --max-tokens" not in str(e), f"Argument parsing failed: {e}"
+
+
+@pytest.mark.e2e
+def test_max_tokens_valid_values():
+    """Test that max_tokens accepts valid integer values"""
+
+    # Test with various valid integer values
+    valid_values = ["0", "100", "1024", "4096"]
+
+    for value in valid_values:
+        try:
+            result = run_ramalama_direct(["run", "--max-tokens", value, "--help"])
+            # Should not raise parsing errors
+            assert "--max-tokens" in result
+        except CalledProcessError as e:
+            assert "unrecognized arguments" not in str(e), f"Should accept valid value {value}"
+
+
+@pytest.mark.e2e
+def test_max_tokens_default_value():
+    """Test that max_tokens has a sensible default value"""
+
+    result = run_ramalama_direct(["run", "--help"])
+
+    # Collect the help lines that mention --max-tokens or its description.
+    # NOTE: this only confirms the option is listed; the default (0) is not asserted.
+    lines = result.split('\n')
+    max_tokens_lines = [line for line in lines if '--max-tokens' in line or 'maximum number of tokens' in line]
+
+    # Should have at least one line mentioning max-tokens
+    assert max_tokens_lines, "Should have help text for --max-tokens"
+
+
+@pytest.mark.e2e
+def test_max_tokens_invalid_value():
+    """Test that max_tokens rejects invalid values"""
+
+    # Test with invalid string value (should be rejected by argparse type checking)
+    try:
+        run_ramalama_direct(["run", "--max-tokens", "invalid", "--help"])
+        # If no exception, this is unexpected but we'll allow it for now
+    except CalledProcessError as e:
+        # Should fail due to invalid type conversion, not unrecognized argument
+        assert "unrecognized arguments: --max-tokens" not in str(e)
+        # argparse should complain about invalid int conversion
+        assert "invalid" in str(e) or "int" in str(e).lower()
+
+
+@pytest.mark.e2e
+def test_max_tokens_negative_value():
+    """Test that max_tokens accepts negative values (though they may be treated as 0)"""
+
+    # Negative values should be accepted by argparse (int type allows them)
+    try:
+        result = run_ramalama_direct(["run", "--max-tokens", "-1", "--help"])
+        # Should not raise parsing errors
+        assert "--max-tokens" in result
+    except CalledProcessError as e:
+        # Should not fail with "unrecognized arguments" for --max-tokens
+        assert "unrecognized arguments: --max-tokens" not in str(e)
diff --git a/test/unit/test_max_tokens.py b/test/unit/test_max_tokens.py