@@ -614,6 +614,62 @@ def test_webdataset_writer_collapsed_text_preserves_element_metadata_json(tmp_pa
614614 assert json .loads (str (rows [1 ]["element_metadata_json" ]))["element_metadata_json" ]["lang" ] == "en"
615615
616616
def test_webdataset_writer_collapsed_text_writes_full_segment_metadata_payload(tmp_path: Path) -> None:
    """Check that collapsed text segments carry their complete element metadata.

    Three text rows and one image row of the same sample are written as a
    webdataset tar. The test asserts that the tar holds one JSON member plus
    the image member, that every segment in the JSON payload exposes its
    decoded metadata (including nested objects and lists), and that reading
    the tar back yields the same text and metadata per text row.
    """
    expected_texts = ["alpha", "beta", "gamma"]
    expected_metadata = [
        {"quality": 0.91, "token_count": 1},
        {"quality": 0.77, "lang": "en", "attrs": {"source": "ocr"}},
        {"quality": 0.55, "tags": ["x", "y"]},
    ]

    columns = {
        "sample_id": ["doc"] * 4,
        "position": [0, 1, 2, 3],
        "modality": ["text", "text", "text", "image"],
        "content_type": ["text/plain", "text/plain", "text/plain", "image/jpeg"],
        "text_content": ["alpha", "beta", "gamma", None],
        "binary_content": [None, None, None, b"img"],
        "element_metadata_json": [
            '{"quality": 0.91, "token_count": 1}',
            '{"quality": 0.77, "lang": "en", "attrs": {"source": "ocr"}}',
            '{"quality": 0.55, "tags": ["x", "y"]}',
            None,
        ],
        "source_id": ["src"] * 4,
        "source_shard": ["shard"] * 4,
        "content_path": [None] * 4,
        "content_key": [None, None, None, "doc.jpg"],
    }
    tar_path = tmp_path / "collapsed_text_full_metadata.tar"
    batch = MultimodalBatch(
        task_id="t-full-meta",
        dataset_name="ds",
        data=pa.table(columns, schema=MULTIMODAL_SCHEMA),
    )
    writer = MultimodalWriterStage(output_path=str(tar_path), output_format="webdataset")
    written = writer.process(batch)
    member_names, member_bytes = _read_tar_members(Path(written.data[0]))

    # The asserted layout: one JSON member for the text rows, one member for the image.
    assert member_names == ["doc.000000.json", "doc.000003.jpg"]
    document = json.loads(member_bytes["doc.000000.json"].decode("utf-8"))
    assert document["sample_id"] == "doc"
    segments = document["segments"]
    assert [segment["text"] for segment in segments] == expected_texts
    assert [segment["element_metadata_json"] for segment in segments] == expected_metadata

    # Read the written tar back and compare the text rows in position order.
    reader = WebDatasetReaderStage(load_binary=False, sample_format="auto")
    read_back = reader.process(
        FileGroupTask(task_id="rt-full-meta", dataset_name="ds", data=[written.data[0]])
    )
    text_rows = [entry for entry in read_back.data.to_pylist() if entry["modality"] == "text"]
    text_rows.sort(key=lambda entry: int(entry["position"]))
    assert [entry["text_content"] for entry in text_rows] == expected_texts
    decoded_metadata = [
        json.loads(str(entry["element_metadata_json"]))["element_metadata_json"]
        for entry in text_rows
    ]
    assert decoded_metadata == expected_metadata
671+
672+
617673def test_webdataset_writer_allows_text_only_batch (tmp_path : Path ) -> None :
618674 out = tmp_path / "text-only.tar"
619675 table = pa .table (
0 commit comments