Commit ea0536c

Return dict from ai_parse_document and keep metadata fields
1 parent 89d6298 commit ea0536c

3 files changed: +32 -44 lines changed

databend_aiserver/udfs/docparse.py

Lines changed: 25 additions & 39 deletions

@@ -256,29 +256,23 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any
         full_path = resolved_path or path
 
         # Keep metadata first for predictable JSON ordering.
-        payload: Dict[str, Any] = OrderedDict(
-            [
-                (
-                    "metadata",
-                    {
-                        "chunk_count": chunk_count,
-                        "chunk_size": DEFAULT_CHUNK_SIZE,
-                        "duration_ms": duration_ms,
-                        "file_size": file_size if file_size is not None else 0,
-                        "filename": Path(path).name,
-                        "path": full_path or path,
-                        "timings_ms": {
-                            "convert": (t_convert_end_ns - t_convert_start_ns)
-                            / 1_000_000.0,
-                            "chunk": (t_chunk_end_ns - t_convert_end_ns) / 1_000_000.0,
-                            "total": duration_ms,
-                        },
-                        "version": 1,
-                    },
-                ),
-                ("chunks", pages),
-            ]
-        )
+        payload: Dict[str, Any] = {
+            "metadata": {
+                "chunk_count": chunk_count,
+                "chunk_size": DEFAULT_CHUNK_SIZE,
+                "duration_ms": duration_ms,
+                "file_size": file_size if file_size is not None else 0,
+                "filename": Path(path).name,
+                "path": full_path or path,
+                "timings_ms": {
+                    "convert": (t_convert_end_ns - t_convert_start_ns) / 1_000_000.0,
+                    "chunk": (t_chunk_end_ns - t_convert_end_ns) / 1_000_000.0,
+                    "total": duration_ms,
+                },
+                "version": 1,
+            },
+            "chunks": pages,
+        }
         if fallback:
             payload["error_information"] = [
                 {
@@ -296,19 +290,11 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any
                 )
         return payload
     except Exception as exc:  # pragma: no cover - defensive for unexpected docling errors
-        return OrderedDict(
-            [
-                (
-                    "metadata",
-                    {
-                        "path": path,
-                        "filename": Path(path).name,
-                    },
-                ),
-                ("chunks", []),
-                (
-                    "error_information",
-                    [{"message": str(exc), "type": exc.__class__.__name__}],
-                ),
-            ]
-        )
+        return {
+            "metadata": {
+                "path": path,
+                "filename": Path(path).name,
+            },
+            "chunks": [],
+            "error_information": [{"message": str(exc), "type": exc.__class__.__name__}],
+        }
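
Note on the change above: since Python 3.7 a plain dict preserves insertion order, and json.dumps emits keys in that order by default, so the OrderedDict wrapper can be dropped while "metadata" still serializes ahead of "chunks". A minimal illustration of that guarantee, using made-up values rather than the real fields:

import json

payload = {
    "metadata": {"filename": "example.pdf", "version": 1},  # hypothetical values
    "chunks": ["first chunk", "second chunk"],
}

serialized = json.dumps(payload)
# Key order survives the round trip: "metadata" still comes before "chunks".
assert list(json.loads(serialized)) == ["metadata", "chunks"]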

tests/integration/test_docparse_integration.py

Lines changed: 4 additions & 4 deletions

@@ -34,13 +34,13 @@ def _call_docparse(client: UDFClient, path: str, memory_stage):
     )
     assert len(result) == 1
     payload_raw = result[0]
+    if hasattr(payload_raw, "as_py"):
+        payload_raw = payload_raw.as_py()
     if isinstance(payload_raw, (bytes, bytearray)):
         payload_raw = payload_raw.decode("utf-8")
     if isinstance(payload_raw, str):
-        payload = json.loads(payload_raw)
-    else:
-        payload = payload_raw
-    return payload
+        return json.loads(payload_raw)
+    return payload_raw
 
 
 def _normalize_payload(payload):
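
The new hasattr(payload_raw, "as_py") guard appears intended for results that arrive as PyArrow scalars rather than plain Python values. A rough sketch of the same conversion pattern, assuming pyarrow is installed; the UDFClient itself is outside this diff, so the input is simulated here with pa.scalar:

import json
import pyarrow as pa

def to_python(value):
    # Arrow scalars expose as_py(); plain Python values pass through untouched.
    if hasattr(value, "as_py"):
        value = value.as_py()
    if isinstance(value, (bytes, bytearray)):
        value = value.decode("utf-8")
    if isinstance(value, str):
        return json.loads(value)
    return value

print(to_python(pa.scalar('{"metadata": {}, "chunks": []}')))  # decoded to a dict
print(to_python({"metadata": {}, "chunks": []}))                # already a dict, returned as-is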

tests/unit/test_docparse_path.py

Lines changed: 3 additions & 1 deletion

@@ -13,10 +13,12 @@
 # limitations under the License.
 
 from databend_aiserver.udfs.docparse import ai_parse_document
+import json
 
 
 def test_docparse_metadata_path_uses_root(memory_stage_with_root):
-    payload = ai_parse_document(memory_stage_with_root, "2206.01062.pdf")
+    raw = ai_parse_document(memory_stage_with_root, "2206.01062.pdf")
+    payload = json.loads(raw) if isinstance(raw, str) else raw
     meta = payload.get("metadata", {})
     assert meta["path"] == "s3://wizardbend/dataset/data/2206.01062.pdf"
     assert meta["filename"] == "2206.01062.pdf"
