Improve attachment validation and add an audio attachment type

codeofdusk · codeofdusk · commit a26d2f45f129 · 2025-07-06T01:29:35.000-07:00
diff --git a/README.md b/README.md
@@ -200,7 +200,7 @@ The first ten digits of pi are 3.141592653.
 
 With no arguments, the `user`, `assistant`, `system`, and `say` commands open an external text editor (based on your system or Gptcmd configuration) for message composition.
 
-### Working with images
+### Working with attachments
 OpenAI's latest models, such as `gpt-4o`, support images alongside text content. Images can be attached to messages with the `image` command, which accepts two arguments: the location of the image, either a URL or path to a local file; and the index of the message to which the image should be attached (if unspecified, it defaults to the last). We'll ask GPT to describe an image by creating a user message and attaching an image from Wikimedia Commons:
 
 ```
@@ -225,6 +225,8 @@ Now, we can `send` our message to get a description:
 This is a white cane, often used by individuals who are blind or visually impaired to aid in mobility and navigation. It has a handle, a long shaft, and a rounded tip.
 ```
 
+Similarly, the `audio` command can be used to attach audio files to messages. Its syntax and operation are identical to those of the `image` command.
+
 ### Managing messages
 The `pop` command with no argument deletes the last message of a conversation:
 
diff --git a/src/gptcmd/cli.py b/src/gptcmd/cli.py
@@ -39,6 +39,7 @@
 from .llm import CompletionError, InvalidAPIParameterError, LLMProviderFeature
 from .macros import MacroError, MacroRunner
 from .message import (
+    Audio,
     Image,
     Message,
     MessageAttachment,
@@ -314,9 +315,7 @@ def _max_numeric_token(s: str) -> int:
                 (
                     0
                     if c.lower().startswith(in_lower)
-                    else 1
-                    if in_lower in c.lower()
-                    else 2
+                    else 1 if in_lower in c.lower() else 2
                 ),
                 # Suffix match (prefer non-digit)
                 # Heuristic: Prefer unversioned model aliases
@@ -1359,7 +1358,7 @@ def _attachment_url_helper(
                 success_callback(msg)
             print(
                 self.__class__._fragment(
-                    attachment_type.__name__ + " added to {msg}", msg
+                    f"{attachment_type.__name__} added to {{msg}}", msg
                 )
             )
         except IndexError:
@@ -1372,6 +1371,29 @@ def do_image(self, arg):
             cmd_name="image", attachment_type=Image, arg=arg
         )
 
+    def do_audio(self, arg):
+        "Attach an audio file at the specified location"
+
+        def _success(msg):
+            if (
+                self._account.provider.model
+                and "audio" not in self._account.provider.model
+                and "gpt-4o-audio-preview"
+                in (self._account.provider.valid_models or ())
+            ):
+                print(
+                    "Warning! The selected model may not support audio. "
+                    "If sending this conversation fails, try switching to an "
+                    "audio-capable model with:\nmodel gpt-4o-audio-preview"
+                )
+
+        return self._attachment_url_helper(
+            cmd_name="audio",
+            attachment_type=Audio,
+            arg=arg,
+            success_callback=_success,
+        )
+
     def do_account(self, arg, _print_on_success: bool = True):
         "Switch between configured accounts."
         if not arg:
diff --git a/src/gptcmd/llm/openai.py b/src/gptcmd/llm/openai.py
@@ -7,6 +7,7 @@
 """
 
 import inspect
+import mimetypes
 import re
 
 from collections import namedtuple
@@ -20,7 +21,7 @@
     LLMProvider,
     LLMResponse,
 )
-from ..message import Image, Message, MessageRole
+from ..message import Audio, Image, Message, MessageRole
 
 import openai
 
@@ -361,6 +362,18 @@ def format_image_for_openai(img: Image) -> Dict[str, Any]:
     return res
 
 
+@OpenAI.register_attachment_formatter(Audio)
+def format_audio_for_openai(a: Audio) -> Dict[str, Any]:
+    res: Dict[str, Any] = {
+        "type": "input_audio",
+        "input_audio": {"data": a.b64},
+    }
+    ext = mimetypes.guess_extension(a.mimetype)
+    if ext:
+        res["input_audio"]["format"] = ext.lstrip(".")
+    return res
+
+
 class StreamedOpenAIResponse(LLMResponse):
     def __init__(self, backing_stream: openai.Stream, provider: OpenAI):
         self._stream = backing_stream
diff --git a/src/gptcmd/message.py b/src/gptcmd/message.py
@@ -8,8 +8,10 @@
 """
 
 import base64
+import binascii
 import dataclasses
 import mimetypes
+import re
 import sys
 import urllib.parse
 import urllib.request
@@ -150,25 +152,44 @@ def __init__(
         self._b64: Optional[str] = None
         self._mimetype: Optional[str] = None
 
-        if url and url.startswith("data:"):
-            header, b64 = url.split(",", 1)
-            self.url = url
-            self._b64 = b64
-            self._mimetype = header[5:].split(";")[0]
-            return
-
         if url:
-            self.url = url
-            self._mimetype = mimetype
-            return
-
-        if b64 and mimetype:
+            if url.startswith("data:"):
+                # data:[<mediatype>][;<param=value>][;base64],<data>
+                match = re.fullmatch(r"data:([^,]*?),(.*)", url, re.I)
+                if not match:
+                    raise ValueError(f"Invalid data URL format: {url}")
+                header, raw_data = match.groups()
+                parts = header.split(";")
+                self._mimetype = parts[0] or "application/octet-stream"
+                is_b64 = any(p.lower() == "base64" for p in parts[1:])
+                if is_b64:
+                    self._b64 = raw_data
+                else:
+                    # URL-encoded data, decode first
+                    decoded_bytes = urllib.parse.unquote_to_bytes(raw_data)
+                    self._b64 = base64.b64encode(decoded_bytes).decode()
+                self.url = url
+            elif re.match(r"^[a-z][a-z0-9+\-.]*://", url, re.I):  # any scheme
+                self.url = url
+                self._mimetype = mimetype
+            else:
+                raise ValueError(
+                    "URL must be a data: URL or start with a scheme (like"
+                    f" http://), got: {url}"
+                )
+        elif b64 and mimetype:
             self._b64 = b64
             self._mimetype = mimetype
             self.url = f"data:{mimetype};base64,{b64}"
-            return
+        else:
+            raise ValueError("Provide either url or both b64 and mimetype")
 
-        raise ValueError("Provide either url or both b64 and mimetype")
+        # Validate base64 data if provided
+        if self._b64:
+            try:
+                base64.b64decode(self._b64, validate=True)
+            except binascii.Error as exc:
+                raise ValueError("Invalid base64 data") from exc
 
     @classmethod
     def from_path(cls, path: str, **kwargs):
@@ -254,6 +275,13 @@ def _deserialize(cls, d: Dict[str, Any]) -> "Image":
         return cls(url=d["url"], detail=d.get("detail"))
 
 
+@attachment_type_registrar.register("audio_url")
+class Audio(FileAttachment):
+    "An audio file."
+
+    pass
+
+
 class UnknownAttachment(MessageAttachment):
     """
     A MessageAttachment created when a dict in the form returned by