Skip to content

Commit b5eb081

Browse files
better resolve binary (#19856)
1 parent aa0b5aa commit b5eb081

File tree

2 files changed

+86
-0
lines changed

2 files changed

+86
-0
lines changed

llama-index-core/llama_index/core/utils.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434

3535
import platformdirs
3636
import requests
37+
from urllib.parse import urlparse
3738

3839
if TYPE_CHECKING:
3940
from nltk.tokenize import PunktSentenceTokenizer
@@ -657,6 +658,37 @@ def resolve_binary(
657658
return BytesIO(data)
658659

659660
elif url is not None:
661+
parsed_url = urlparse(url)
662+
if parsed_url.scheme == "data":
663+
# Parse data URL: data:[<mediatype>][;base64],<data>
664+
# The path contains everything after "data:"
665+
data_part = parsed_url.path
666+
667+
# Split on the first comma to separate metadata from data
668+
if "," not in data_part:
669+
raise ValueError("Invalid data URL format: missing comma separator")
670+
671+
metadata, url_data = data_part.split(",", 1)
672+
is_base64_encoded = metadata.endswith(";base64")
673+
674+
if is_base64_encoded:
675+
# Data is base64 encoded in the URL
676+
decoded_data = base64.b64decode(url_data)
677+
if as_base64:
678+
# Return as base64 bytes
679+
return BytesIO(base64.b64encode(decoded_data))
680+
else:
681+
# Return decoded binary data
682+
return BytesIO(decoded_data)
683+
else:
684+
# Data is not base64 encoded in the URL (URL-encoded text)
685+
if as_base64:
686+
# Encode the text data as base64
687+
return BytesIO(base64.b64encode(url_data.encode("utf-8")))
688+
else:
689+
# Return as text bytes
690+
return BytesIO(url_data.encode("utf-8"))
691+
660692
headers = {
661693
"User-Agent": "LlamaIndex/0.0 (https://llamaindex.ai; info@llamaindex.ai) llama-index-core/0.0"
662694
}

llama-index-core/tests/base/llms/test_types.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -201,6 +201,46 @@ def test_image_block_resolve_image_url(png_1px_b64: bytes, png_1px: bytes):
201201
assert img.read() == png_1px_b64
202202

203203

204+
def test_image_block_resolve_image_data_url_base64(png_1px_b64: bytes, png_1px: bytes):
205+
# Test data URL with base64 encoding
206+
data_url = f"data:image/png;base64,{png_1px_b64.decode('utf-8')}"
207+
b = ImageBlock(url=AnyUrl(url=data_url))
208+
209+
img = b.resolve_image()
210+
assert isinstance(img, BytesIO)
211+
assert img.read() == png_1px
212+
213+
img = b.resolve_image(as_base64=True)
214+
assert isinstance(img, BytesIO)
215+
assert img.read() == png_1px_b64
216+
217+
218+
def test_image_block_resolve_image_data_url_plain_text():
219+
# Test data URL with plain text (no base64)
220+
test_text = "Hello, World!"
221+
data_url = f"data:text/plain,{test_text}"
222+
b = ImageBlock(url=AnyUrl(url=data_url))
223+
224+
img = b.resolve_image()
225+
assert isinstance(img, BytesIO)
226+
assert img.read() == test_text.encode("utf-8")
227+
228+
img = b.resolve_image(as_base64=True)
229+
assert isinstance(img, BytesIO)
230+
assert img.read() == base64.b64encode(test_text.encode("utf-8"))
231+
232+
233+
def test_image_block_resolve_image_data_url_invalid():
234+
# Test invalid data URL format (missing comma)
235+
invalid_data_url = "data:image/png;base64"
236+
b = ImageBlock(url=AnyUrl(url=invalid_data_url))
237+
238+
with pytest.raises(
239+
ValueError, match="Invalid data URL format: missing comma separator"
240+
):
241+
b.resolve_image()
242+
243+
204244
def test_image_block_resolve_error():
205245
with pytest.raises(
206246
ValueError, match="No valid source provided to resolve binary data!"
@@ -383,6 +423,20 @@ def test_video_block_resolve_video_url(mp4_bytes: bytes, mp4_base64: bytes):
383423
assert vid.read() == mp4_base64
384424

385425

426+
def test_video_block_resolve_video_data_url_base64(mp4_bytes: bytes, mp4_base64: bytes):
427+
# Test data URL with base64 encoding
428+
data_url = f"data:video/mp4;base64,{mp4_base64.decode('utf-8')}"
429+
b = VideoBlock(url=AnyUrl(url=data_url))
430+
431+
vid = b.resolve_video()
432+
assert isinstance(vid, BytesIO)
433+
assert vid.read() == mp4_bytes
434+
435+
vid = b.resolve_video(as_base64=True)
436+
assert isinstance(vid, BytesIO)
437+
assert vid.read() == mp4_base64
438+
439+
386440
def test_video_block_resolve_error():
387441
b = VideoBlock()
388442
with pytest.raises(ValueError, match="No valid source provided"):

0 commit comments

Comments
 (0)