Skip to content

Commit a26d2f4

Browse files
committed
Improve attachment validation and add an audio attachment type
1 parent 91bd06e commit a26d2f4

File tree

4 files changed

+85
-20
lines changed

4 files changed

+85
-20
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ The first ten digits of pi are 3.141592653.
200200

201201
With no arguments, the `user`, `assistant`, `system`, and `say` commands open an external text editor (based on your system or Gptcmd configuration) for message composition.
202202

203-
### Working with images
203+
### Working with attachments
204204
OpenAI's latest models, such as `gpt-4o`, support images alongside text content. Images can be attached to messages with the `image` command, which accepts two arguments: the location of the image, either a URL or path to a local file; and the index of the message to which the image should be attached (if unspecified, it defaults to the last). We'll ask GPT to describe an image by creating a user message and attaching an image from Wikimedia Commons:
205205

206206
```
@@ -225,6 +225,8 @@ Now, we can `send` our message to get a description:
225225
This is a white cane, often used by individuals who are blind or visually impaired to aid in mobility and navigation. It has a handle, a long shaft, and a rounded tip.
226226
```
227227

228+
Similarly, the `audio` command can be used to attach audio files to messages. Its syntax and operation are identical to the `image` command.
229+
228230
### Managing messages
229231
The `pop` command with no argument deletes the last message of a conversation:
230232

src/gptcmd/cli.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from .llm import CompletionError, InvalidAPIParameterError, LLMProviderFeature
4040
from .macros import MacroError, MacroRunner
4141
from .message import (
42+
Audio,
4243
Image,
4344
Message,
4445
MessageAttachment,
@@ -314,9 +315,7 @@ def _max_numeric_token(s: str) -> int:
314315
(
315316
0
316317
if c.lower().startswith(in_lower)
317-
else 1
318-
if in_lower in c.lower()
319-
else 2
318+
else 1 if in_lower in c.lower() else 2
320319
),
321320
# Suffix match (prefer non-digit)
322321
# Heuristic: Prefer unversioned model aliases
@@ -1359,7 +1358,7 @@ def _attachment_url_helper(
13591358
success_callback(msg)
13601359
print(
13611360
self.__class__._fragment(
1362-
attachment_type.__name__ + " added to {msg}", msg
1361+
f"{attachment_type.__name__} added to {{msg}}", msg
13631362
)
13641363
)
13651364
except IndexError:
@@ -1372,6 +1371,29 @@ def do_image(self, arg):
13721371
cmd_name="image", attachment_type=Image, arg=arg
13731372
)
13741373

1374+
def do_audio(self, arg):
1375+
"Attach an audio file at the specified location"
1376+
1377+
def _success(msg):
1378+
if (
1379+
self._account.provider.model
1380+
and "audio" not in self._account.provider.model
1381+
and "gpt-4o-audio-preview"
1382+
in (self._account.provider.valid_models or ())
1383+
):
1384+
print(
1385+
"Warning! The selected model may not support audio. "
1386+
"If sending this conversation fails, try switching to an "
1387+
"audio-capable model with:\nmodel gpt-4o-audio-preview"
1388+
)
1389+
1390+
return self._attachment_url_helper(
1391+
cmd_name="audio",
1392+
attachment_type=Audio,
1393+
arg=arg,
1394+
success_callback=_success,
1395+
)
1396+
13751397
def do_account(self, arg, _print_on_success: bool = True):
13761398
"Switch between configured accounts."
13771399
if not arg:

src/gptcmd/llm/openai.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"""
88

99
import inspect
10+
import mimetypes
1011
import re
1112

1213
from collections import namedtuple
@@ -20,7 +21,7 @@
2021
LLMProvider,
2122
LLMResponse,
2223
)
23-
from ..message import Image, Message, MessageRole
24+
from ..message import Audio, Image, Message, MessageRole
2425

2526
import openai
2627

@@ -361,6 +362,18 @@ def format_image_for_openai(img: Image) -> Dict[str, Any]:
361362
return res
362363

363364

365+
@OpenAI.register_attachment_formatter(Audio)
366+
def format_audio_for_openai(a: Audio) -> Dict[str, Any]:
367+
res: Dict[str, Any] = {
368+
"type": "input_audio",
369+
"input_audio": {"data": a.b64},
370+
}
371+
ext = mimetypes.guess_extension(a.mimetype)
372+
if ext:
373+
res["input_audio"]["format"] = ext.lstrip(".")
374+
return res
375+
376+
364377
class StreamedOpenAIResponse(LLMResponse):
365378
def __init__(self, backing_stream: openai.Stream, provider: OpenAI):
366379
self._stream = backing_stream

src/gptcmd/message.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
"""
99

1010
import base64
11+
import binascii
1112
import dataclasses
1213
import mimetypes
14+
import re
1315
import sys
1416
import urllib.parse
1517
import urllib.request
@@ -150,25 +152,44 @@ def __init__(
150152
self._b64: Optional[str] = None
151153
self._mimetype: Optional[str] = None
152154

153-
if url and url.startswith("data:"):
154-
header, b64 = url.split(",", 1)
155-
self.url = url
156-
self._b64 = b64
157-
self._mimetype = header[5:].split(";")[0]
158-
return
159-
160155
if url:
161-
self.url = url
162-
self._mimetype = mimetype
163-
return
164-
165-
if b64 and mimetype:
156+
if url.startswith("data:"):
157+
# data:[<mediatype>][;<param=value>][;base64],<data>
158+
match = re.fullmatch(r"data:([^,]*?),(.*)", url, re.I)
159+
if not match:
160+
raise ValueError(f"Invalid data URL format: {url}")
161+
header, raw_data = match.groups()
162+
parts = header.split(";")
163+
self._mimetype = parts[0] or "application/octet-stream"
164+
is_b64 = any(p.lower() == "base64" for p in parts[1:])
165+
if is_b64:
166+
self._b64 = raw_data
167+
else:
168+
# URL-encoded data, decode first
169+
decoded_bytes = urllib.parse.unquote_to_bytes(raw_data)
170+
self._b64 = base64.b64encode(decoded_bytes).decode()
171+
self.url = url
172+
elif re.match(r"^[a-z][a-z0-9+\-.]*://", url, re.I): # any scheme
173+
self.url = url
174+
self._mimetype = mimetype
175+
else:
176+
raise ValueError(
177+
"URL must be a data: URL or start with a scheme (like"
178+
f" http://), got: {url}"
179+
)
180+
elif b64 and mimetype:
166181
self._b64 = b64
167182
self._mimetype = mimetype
168183
self.url = f"data:{mimetype};base64,{b64}"
169-
return
184+
else:
185+
raise ValueError("Provide either url or both b64 and mimetype")
170186

171-
raise ValueError("Provide either url or both b64 and mimetype")
187+
# Validate base64 data if provided
188+
if self._b64:
189+
try:
190+
base64.b64decode(self._b64, validate=True)
191+
except binascii.Error as exc:
192+
raise ValueError("Invalid base64 data") from exc
172193

173194
@classmethod
174195
def from_path(cls, path: str, **kwargs):
@@ -254,6 +275,13 @@ def _deserialize(cls, d: Dict[str, Any]) -> "Image":
254275
return cls(url=d["url"], detail=d.get("detail"))
255276

256277

278+
@attachment_type_registrar.register("audio_url")
279+
class Audio(FileAttachment):
280+
"An audio file."
281+
282+
pass
283+
284+
257285
class UnknownAttachment(MessageAttachment):
258286
"""
259287
A MessageAttachment created when a dict in the form returned by

0 commit comments

Comments
 (0)