From dd3bdb061e62156f00d577e59ce1e58c0e3f932e Mon Sep 17 00:00:00 2001 From: Shabie Iqbal Date: Mon, 29 Sep 2025 00:30:04 +0200 Subject: [PATCH] fix: add helper to handle base64 PDF docs for converse_stream --- src/strands/models/bedrock.py | 54 ++++++++++++++++++++-- tests/strands/models/test_bedrock.py | 68 ++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/src/strands/models/bedrock.py b/src/strands/models/bedrock.py index c6a500597..be0c470fe 100644 --- a/src/strands/models/bedrock.py +++ b/src/strands/models/bedrock.py @@ -4,6 +4,8 @@ """ import asyncio +import base64 +import binascii import json import logging import os @@ -350,6 +352,45 @@ def _should_include_tool_result_status(self) -> bool: else: # "auto" return any(model in self.config["model_id"] for model in _MODELS_INCLUDE_STATUS) + def _coerce_to_bytes(self, value: Any, *, expected_fmt: Optional[str] = None) -> bytes: + """Normalize bytes-like inputs to raw bytes for Bedrock requests. + + Args: + value: Input that should represent binary data. + expected_fmt: Optional file format hint used for error messaging. + + Returns: + Raw bytes suitable for Bedrock's `source` payloads. + + Raises: + TypeError: If the provided value cannot be interpreted as bytes. + """ + if hasattr(value, "read") and callable(value.read): + data = value.read() + if isinstance(data, bytes): + return data + if isinstance(data, str): + return data.encode("utf-8") + return bytes(data) + + if isinstance(value, (bytes, bytearray, memoryview)): + return bytes(value) + + # Base64-encoded strings (optionally data URLs) + if isinstance(value, str): + data_str = value + if data_str.startswith("data:") and ";base64," in data_str: + data_str = data_str.split(",", 1)[1] + + try: + return base64.b64decode(data_str, validate=True) + except binascii.Error as exc: + raise TypeError( + f"document.source.bytes must be raw bytes or a base64-encoded string (format={expected_fmt!r})." + ) from exc + + raise TypeError(f"Unsupported type for bytes conversion: {type(value).__name__}") + def _format_request_message_content(self, content: ContentBlock) -> dict[str, Any]: """Format a Bedrock content block. @@ -382,7 +423,14 @@ def _format_request_message_content(self, content: ContentBlock) -> dict[str, An # Handle source if "source" in document: - result["source"] = {"bytes": document["source"]["bytes"]} + source = document["source"] + + if "bytes" in source: + result["source"] = { + "bytes": self._coerce_to_bytes(source["bytes"], expected_fmt=document.get("format")) + } + else: + raise TypeError("document.source must include 'bytes'") # Handle optional fields if "citations" in document and document["citations"] is not None: @@ -405,7 +453,7 @@ def _format_request_message_content(self, content: ContentBlock) -> dict[str, An source = image["source"] formatted_source = {} if "bytes" in source: - formatted_source = {"bytes": source["bytes"]} + formatted_source = {"bytes": self._coerce_to_bytes(source["bytes"], expected_fmt=image.get("format"))} result = {"format": image["format"], "source": formatted_source} return {"image": result} @@ -470,7 +518,7 @@ def _format_request_message_content(self, content: ContentBlock) -> dict[str, An source = video["source"] formatted_source = {} if "bytes" in source: - formatted_source = {"bytes": source["bytes"]} + formatted_source = {"bytes": self._coerce_to_bytes(source["bytes"], expected_fmt=video.get("format"))} result = {"format": video["format"], "source": formatted_source} return {"video": result} diff --git a/tests/strands/models/test_bedrock.py b/tests/strands/models/test_bedrock.py index 96fee67fa..27ed02c97 100644 --- a/tests/strands/models/test_bedrock.py +++ b/tests/strands/models/test_bedrock.py @@ -1,3 +1,4 @@ +import base64 import os import sys import unittest.mock @@ -421,6 +422,73 @@ def test_format_request_tool_specs(model, messages, model_id, tool_spec): assert tru_request == exp_request +def test_format_request_document_base64_bytes(model, model_id): + pdf_bytes = b"%PDF-1.4 test pdf" + encoded = base64.b64encode(pdf_bytes).decode("ascii") + messages = [ + { + "role": "user", + "content": [ + { + "document": { + "name": "testing.pdf", + "format": "pdf", + "source": {"bytes": encoded}, + } + } + ], + } + ] + + request = model.format_request(messages) + + doc_source = request["messages"][0]["content"][0]["document"]["source"] + assert doc_source["bytes"] == pdf_bytes + + +def test_format_request_document_plain_text_raises(model): + messages = [ + { + "role": "user", + "content": [ + { + "document": { + "name": "testing.pdf", + "format": "pdf", + "source": {"bytes": "this is not base64"}, + } + } + ], + } + ] + + with pytest.raises(TypeError): + model.format_request(messages) + + +def test_format_request_document_raw_bytes(model): + pdf_bytes = b"%PDF-1.4 test pdf" + messages = [ + { + "role": "user", + "content": [ + { + "document": { + "name": "testing.pdf", + "format": "pdf", + "source": {"bytes": pdf_bytes}, + } + } + ], + } + ] + + request = model.format_request(messages) + + doc_source = request["messages"][0]["content"][0]["document"]["source"] + assert doc_source["bytes"] == pdf_bytes + + def test_format_request_tool_choice_auto(model, messages, model_id, tool_spec): tool_choice = {"auto": {}} tru_request = model.format_request(messages, [tool_spec], tool_choice=tool_choice)