From db920efb87f543265f840287a899241f0439567d Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 14 Nov 2025 11:41:09 +0100 Subject: [PATCH 01/20] refactor: move WebVTT data model from docling Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 416 +++++++++++++++++++++++++ test/data/webvtt/webvtt_example_01.vtt | 42 +++ test/data/webvtt/webvtt_example_02.vtt | 15 + test/data/webvtt/webvtt_example_03.vtt | 57 ++++ test/test_webvtt.py | 199 ++++++++++++ 5 files changed, 729 insertions(+) create mode 100644 docling_core/types/doc/webvtt.py create mode 100644 test/data/webvtt/webvtt_example_01.vtt create mode 100644 test/data/webvtt/webvtt_example_02.vtt create mode 100644 test/data/webvtt/webvtt_example_03.vtt create mode 100644 test/test_webvtt.py diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py new file mode 100644 index 00000000..eccae4a6 --- /dev/null +++ b/docling_core/types/doc/webvtt.py @@ -0,0 +1,416 @@ +"""Models for the Docling's adoption of Web Video Text Tracks format.""" + +import logging +import re +from typing import Annotated, ClassVar, Literal, Optional, Union, cast + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic.types import StringConstraints +from typing_extensions import Self, override + +_log = logging.getLogger(__name__) + + +class _WebVTTTimestamp(BaseModel): + """Model representing a WebVTT timestamp. + + A WebVTT timestamp is always interpreted relative to the current playback position + of the media data that the WebVTT file is to be synchronized with. 
+ """ + + model_config = ConfigDict(regex_engine="python-re") + + raw: Annotated[ + str, + Field( + description="A representation of the WebVTT Timestamp as a single string" + ), + ] + + _pattern: ClassVar[re.Pattern] = re.compile( + r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$" + ) + _hours: int + _minutes: int + _seconds: int + _millis: int + + @model_validator(mode="after") + def validate_raw(self) -> Self: + m = self._pattern.match(self.raw) + if not m: + raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}") + self._hours = int(m.group(1)) if m.group(1) else 0 + self._minutes = int(m.group(2)) + self._seconds = int(m.group(3)) + self._millis = int(m.group(4)) + + if self._minutes < 0 or self._minutes > 59: + raise ValueError("Minutes must be between 0 and 59") + if self._seconds < 0 or self._seconds > 59: + raise ValueError("Seconds must be between 0 and 59") + + return self + + @property + def seconds(self) -> float: + """A representation of the WebVTT Timestamp in seconds.""" + return ( + self._hours * 3600 + + self._minutes * 60 + + self._seconds + + self._millis / 1000.0 + ) + + @override + def __str__(self) -> str: + return self.raw + + +_WebVTTCueIdentifier = Annotated[ + str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") +] + + +class _WebVTTCueTimings(BaseModel): + """Model representating WebVTT cue timings.""" + + start: Annotated[ + _WebVTTTimestamp, Field(description="Start time offset of the cue") + ] + end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")] + + @model_validator(mode="after") + def check_order(self) -> Self: + if self.start and self.end: + if self.end.seconds <= self.start.seconds: + raise ValueError("End timestamp must be greater than start timestamp") + return self + + @override + def __str__(self): + return f"{self.start} --> {self.end}" + + +class _WebVTTCueTextSpan(BaseModel): + """Model representing a WebVTT cue text span.""" + + text: str + span_type: Literal["text"] = 
"text" + + @field_validator("text", mode="after") + @classmethod + def validate_text(cls, value: str) -> str: + if any(ch in value for ch in {"\n", "\r", "&", "<"}): + raise ValueError("Cue text span contains invalid characters") + if len(value) == 0: + raise ValueError("Cue text span cannot be empty") + return value + + @override + def __str__(self): + return self.text + + +class _WebVTTCueVoiceSpan(BaseModel): + """Model representing a WebVTT cue voice span.""" + + annotation: Annotated[ + str, + Field( + description=( + "Cue span start tag annotation text representing the name of thevoice" + ) + ), + ] + classes: Annotated[ + list[str], + Field(description="List of classes representing the cue span's significance"), + ] = [] + components: Annotated[ + list["_WebVTTCueComponent"], + Field(description="The components representing the cue internal text"), + ] = [] + span_type: Literal["v"] = "v" + + @field_validator("annotation", mode="after") + @classmethod + def validate_annotation(cls, value: str) -> str: + if any(ch in value for ch in {"\n", "\r", "&", ">"}): + raise ValueError( + "Cue span start tag annotation contains invalid characters" + ) + if not value: + raise ValueError("Cue text span cannot be empty") + return value + + @field_validator("classes", mode="after") + @classmethod + def validate_classes(cls, value: list[str]) -> list[str]: + for item in value: + if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): + raise ValueError( + "A cue span start tag class contains invalid characters" + ) + if not item: + raise ValueError("Cue span start tag classes cannot be empty") + return value + + @override + def __str__(self): + tag = f"v.{'.'.join(self.classes)}" if self.classes else "v" + inner = "".join(str(span) for span in self.components) + return f"<{tag} {self.annotation}>{inner}" + + +class _WebVTTCueClassSpan(BaseModel): + span_type: Literal["c"] = "c" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + 
inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueItalicSpan(BaseModel): + span_type: Literal["i"] = "i" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueBoldSpan(BaseModel): + span_type: Literal["b"] = "b" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueUnderlineSpan(BaseModel): + span_type: Literal["u"] = "u" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +_WebVTTCueComponent = Annotated[ + Union[ + _WebVTTCueTextSpan, + _WebVTTCueClassSpan, + _WebVTTCueItalicSpan, + _WebVTTCueBoldSpan, + _WebVTTCueUnderlineSpan, + _WebVTTCueVoiceSpan, + ], + Field(discriminator="span_type", description="The WebVTT cue component"), +] + + +class _WebVTTCueBlock(BaseModel): + """Model representing a WebVTT cue block. + + The optional WebVTT cue settings list is not supported. + The cue payload is limited to the following spans: text, class, italic, bold, + underline, and voice. + """ + + model_config = ConfigDict(regex_engine="python-re") + + identifier: Optional[_WebVTTCueIdentifier] = Field( + None, description="The WebVTT cue identifier" + ) + timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] + payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")] + + _pattern_block: ClassVar[re.Pattern] = re.compile( + r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>" + ) + _pattern_voice_tag: ClassVar[re.Pattern] = re.compile( + r"^\.[^\t\n\r &<>]+)?" 
# zero or more classes + r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation + ) + + @field_validator("payload", mode="after") + @classmethod + def validate_payload(cls, payload): + for voice in payload: + if "-->" in str(voice): + raise ValueError("Cue payload must not contain '-->'") + return payload + + @classmethod + def parse(cls, raw: str) -> "_WebVTTCueBlock": + lines = raw.strip().splitlines() + if not lines: + raise ValueError("Cue block must have at least one line") + identifier: Optional[_WebVTTCueIdentifier] = None + timing_line = lines[0] + if "-->" not in timing_line and len(lines) > 1: + identifier = timing_line + timing_line = lines[1] + cue_lines = lines[2:] + else: + cue_lines = lines[1:] + + if "-->" not in timing_line: + raise ValueError("Cue block must contain WebVTT cue timings") + + start, end = [t.strip() for t in timing_line.split("-->")] + end = re.split(" |\t", end)[0] # ignore the cue settings list + timings: _WebVTTCueTimings = _WebVTTCueTimings( + start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end) + ) + cue_text = " ".join(cue_lines).strip() + if cue_text.startswith("" not in cue_text: + # adding close tag for cue voice spans without end tag + cue_text += "" + + stack: list[list[_WebVTTCueComponent]] = [[]] + tag_stack: list[Union[str, tuple]] = [] + + pos = 0 + matches = list(cls._pattern_block.finditer(cue_text)) + i = 0 + while i < len(matches): + match = matches[i] + if match.start() > pos: + stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) + tag = match.group(0) + + if tag.startswith(("", "", "", "")): + tag_type = tag[1:2] + tag_stack.append(tag_type) + stack.append([]) + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueItalicSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueBoldSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + 
stack[-1].append(_WebVTTCueUnderlineSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueClassSpan(components=children)) + tag_stack.pop() + elif tag.startswith("")) + else: + parts.append(str(span)) + + return "".join(parts) + + +class _WebVTTFile(BaseModel): + """A model representing a WebVTT file.""" + + cue_blocks: list[_WebVTTCueBlock] + + @staticmethod + def verify_signature(content: str) -> bool: + if not content: + return False + elif len(content) == 6: + return content == "WEBVTT" + elif len(content) > 6 and content.startswith("WEBVTT"): + return content[6] in (" ", "\t", "\n") + else: + return False + + @classmethod + def parse(cls, raw: str) -> "_WebVTTFile": + # Normalize newlines to LF + raw = raw.replace("\r\n", "\n").replace("\r", "\n") + + # Check WebVTT signature + if not cls.verify_signature(raw): + raise ValueError("Invalid WebVTT file signature") + + # Strip "WEBVTT" header line + lines = raw.split("\n", 1) + body = lines[1] if len(lines) > 1 else "" + + # Remove NOTE/STYLE/REGION blocks + body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE) + body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE) + + # Split into cue blocks + raw_blocks = re.split(r"\n\s*\n", body.strip()) + cues: list[_WebVTTCueBlock] = [] + for block in raw_blocks: + try: + cues.append(_WebVTTCueBlock.parse(block)) + except ValueError as e: + _log.warning(f"Failed to parse cue block:\n{block}\n{e}") + + return cls(cue_blocks=cues) + + def __iter__(self): + return iter(self.cue_blocks) + + def __getitem__(self, idx): + return self.cue_blocks[idx] + + def __len__(self): + return len(self.cue_blocks) diff --git a/test/data/webvtt/webvtt_example_01.vtt b/test/data/webvtt/webvtt_example_01.vtt new file mode 100644 index 00000000..333ca4a8 --- /dev/null +++ b/test/data/webvtt/webvtt_example_01.vtt @@ -0,0 +1,42 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. 
https://www.w3.org/TR/webvtt1/ + +00:11.000 --> 00:13.000 +We are in New York City + +00:13.000 --> 00:16.000 +We’re actually at the Lucern Hotel, just down the street + +00:16.000 --> 00:18.000 +from the American Museum of Natural History + +00:18.000 --> 00:20.000 +And with me is Neil deGrasse Tyson + +00:20.000 --> 00:22.000 +Astrophysicist, Director of the Hayden Planetarium + +00:22.000 --> 00:24.000 +at the AMNH. + +00:24.000 --> 00:26.000 +Thank you for walking down here. + +00:27.000 --> 00:30.000 +And I want to do a follow-up on the last conversation we did. + +00:30.000 --> 00:31.500 align:right size:50% +When we e-mailed— + +00:30.500 --> 00:32.500 align:left size:50% +Didn’t we talk about enough in that conversation? + +00:32.000 --> 00:35.500 align:right size:50% +No! No no no no; 'cos 'cos obviously 'cos + +00:32.500 --> 00:33.500 align:left size:50% +Laughs + +00:35.500 --> 00:38.000 +You know I’m so excited my glasses are falling off here. diff --git a/test/data/webvtt/webvtt_example_02.vtt b/test/data/webvtt/webvtt_example_02.vtt new file mode 100644 index 00000000..1152a1e8 --- /dev/null +++ b/test/data/webvtt/webvtt_example_02.vtt @@ -0,0 +1,15 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +00:00.000 --> 00:02.000 +It’s a blue apple tree! + +00:02.000 --> 00:04.000 +No way! + +00:04.000 --> 00:06.000 +Hee! laughter + +00:06.000 --> 00:08.000 +That’s awesome! \ No newline at end of file diff --git a/test/data/webvtt/webvtt_example_03.vtt b/test/data/webvtt/webvtt_example_03.vtt new file mode 100644 index 00000000..a4dc1291 --- /dev/null +++ b/test/data/webvtt/webvtt_example_03.vtt @@ -0,0 +1,57 @@ +WEBVTT + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +00:00:04.963 --> 00:00:08.571 +OK, +I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 +00:00:08.571 --> 00:00:09.403 +properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 +00:00:10.683 --> 00:00:11.563 +Good. 
+ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 +00:00:13.363 --> 00:00:13.803 +Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 +00:00:49.603 --> 00:00:53.363 +I was also thinking. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 +00:00:54.963 --> 00:01:02.072 +Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 +00:01:02.072 --> 00:01:06.811 +some metadata, +some options that can be specific. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 +00:01:10.243 --> 00:01:13.014 +Yeah, +I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 +00:01:10.563 --> 00:01:12.643 +But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +00:01:13.014 --> 00:01:15.907 +than me. +I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +00:01:50.222 --> 00:01:51.643 +give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +00:01:52.043 --> 00:01:55.043 +Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +00:01:54.603 --> 00:01:55.283 +See you. 
\ No newline at end of file diff --git a/test/test_webvtt.py b/test/test_webvtt.py new file mode 100644 index 00000000..75f5dfc1 --- /dev/null +++ b/test/test_webvtt.py @@ -0,0 +1,199 @@ +# Assisted by watsonx Code Assistant + + +import pytest +from pydantic import ValidationError + +from docling_core.types.doc.webvtt import ( + _WebVTTCueItalicSpan, + _WebVTTCueTextSpan, + _WebVTTCueTimings, + _WebVTTCueVoiceSpan, + _WebVTTFile, + _WebVTTTimestamp, +) + +from .test_data_gen_flag import GEN_TEST_DATA + +GENERATE = GEN_TEST_DATA + + +def test_vtt_cue_commponents(): + """Test WebVTT components.""" + valid_timestamps = [ + "00:01:02.345", + "12:34:56.789", + "02:34.567", + "00:00:00.000", + ] + valid_total_seconds = [ + 1 * 60 + 2.345, + 12 * 3600 + 34 * 60 + 56.789, + 2 * 60 + 34.567, + 0.0, + ] + for idx, ts in enumerate(valid_timestamps): + model = _WebVTTTimestamp(raw=ts) + assert model.seconds == valid_total_seconds[idx] + + """Test invalid WebVTT timestamps.""" + invalid_timestamps = [ + "00:60:02.345", # minutes > 59 + "00:01:60.345", # seconds > 59 + "00:01:02.1000", # milliseconds > 999 + "01:02:03", # missing milliseconds + "01:02", # missing milliseconds + ":01:02.345", # extra : for missing hours + "abc:01:02.345", # invalid format + ] + for ts in invalid_timestamps: + with pytest.raises(ValidationError): + _WebVTTTimestamp(raw=ts) + + """Test the timestamp __str__ method.""" + model = _WebVTTTimestamp(raw="00:01:02.345") + assert str(model) == "00:01:02.345" + + """Test valid cue timings.""" + start = _WebVTTTimestamp(raw="00:10.005") + end = _WebVTTTimestamp(raw="00:14.007") + cue_timings = _WebVTTCueTimings(start=start, end=end) + assert cue_timings.start == start + assert cue_timings.end == end + assert str(cue_timings) == "00:10.005 --> 00:14.007" + + """Test invalid cue timings with end timestamp before start.""" + start = _WebVTTTimestamp(raw="00:10.700") + end = _WebVTTTimestamp(raw="00:10.500") + with pytest.raises(ValidationError) as excinfo: + 
_WebVTTCueTimings(start=start, end=end) + assert "End timestamp must be greater than start timestamp" in str(excinfo.value) + + """Test invalid cue timings with missing end.""" + start = _WebVTTTimestamp(raw="00:10.500") + with pytest.raises(ValidationError) as excinfo: + _WebVTTCueTimings(start=start) + assert "Field required" in str(excinfo.value) + + """Test invalid cue timings with missing start.""" + end = _WebVTTTimestamp(raw="00:10.500") + with pytest.raises(ValidationError) as excinfo: + _WebVTTCueTimings(end=end) + assert "Field required" in str(excinfo.value) + + """Test with valid text.""" + valid_text = "This is a valid cue text span." + span = _WebVTTCueTextSpan(text=valid_text) + assert span.text == valid_text + assert str(span) == valid_text + + """Test with text containing newline characters.""" + invalid_text = "This cue text span\ncontains a newline." + with pytest.raises(ValidationError): + _WebVTTCueTextSpan(text=invalid_text) + + """Test with text containing ampersand.""" + invalid_text = "This cue text span contains &." + with pytest.raises(ValidationError): + _WebVTTCueTextSpan(text=invalid_text) + + """Test with text containing less-than sign.""" + invalid_text = "This cue text span contains <." 
+ with pytest.raises(ValidationError): + _WebVTTCueTextSpan(text=invalid_text) + + """Test with empty text.""" + with pytest.raises(ValidationError): + _WebVTTCueTextSpan(text="") + + """Test that annotation validation works correctly.""" + valid_annotation = "valid-annotation" + invalid_annotation = "invalid\nannotation" + with pytest.raises(ValidationError): + _WebVTTCueVoiceSpan(annotation=invalid_annotation) + assert _WebVTTCueVoiceSpan(annotation=valid_annotation) + + """Test that classes validation works correctly.""" + annotation = "speaker name" + valid_classes = ["class1", "class2"] + invalid_classes = ["class\nwith\nnewlines", ""] + with pytest.raises(ValidationError): + _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes) + assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes) + + """Test that components validation works correctly.""" + annotation = "speaker name" + valid_components = [_WebVTTCueTextSpan(text="random text")] + invalid_components = [123, "not a component"] + with pytest.raises(ValidationError): + _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components) + assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components) + + """Test valid cue voice spans.""" + cue_span = _WebVTTCueVoiceSpan( + annotation="speaker", + classes=["loud", "clear"], + components=[_WebVTTCueTextSpan(text="random text")], + ) + + expected_str = "random text" + assert str(cue_span) == expected_str + + cue_span = _WebVTTCueVoiceSpan( + annotation="speaker", + components=[_WebVTTCueTextSpan(text="random text")], + ) + expected_str = "random text" + assert str(cue_span) == expected_str + + +def test_webvtt_file(): + """Test WebVTT files.""" + with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f: + content = f.read() + vtt = _WebVTTFile.parse(content) + assert len(vtt) == 13 + block = vtt.cue_blocks[11] + assert str(block.timings) == "00:32.500 --> 00:33.500" + assert 
len(block.payload) == 1 + cue_span = block.payload[0] + assert isinstance(cue_span, _WebVTTCueVoiceSpan) + assert cue_span.annotation == "Neil deGrasse Tyson" + assert not cue_span.classes + assert len(cue_span.components) == 1 + comp = cue_span.components[0] + assert isinstance(comp, _WebVTTCueItalicSpan) + assert len(comp.components) == 1 + comp2 = comp.components[0] + assert isinstance(comp2, _WebVTTCueTextSpan) + assert comp2.text == "Laughs" + + with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f: + content = f.read() + vtt = _WebVTTFile.parse(content) + assert len(vtt) == 4 + reverse = ( + "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. " + "https://www.w3.org/TR/webvtt1/\n\n" + ) + reverse += "\n\n".join([str(block) for block in vtt.cue_blocks]) + assert content == reverse + + with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f: + content = f.read() + vtt = _WebVTTFile.parse(content) + assert len(vtt) == 13 + for block in vtt: + assert block.identifier + block = vtt.cue_blocks[0] + assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" + assert str(block.timings) == "00:00:04.963 --> 00:00:08.571" + assert len(block.payload) == 1 + assert isinstance(block.payload[0], _WebVTTCueVoiceSpan) + block = vtt.cue_blocks[2] + assert isinstance(cue_span, _WebVTTCueVoiceSpan) + assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" + assert str(block.timings) == "00:00:10.683 --> 00:00:11.563" + assert len(block.payload) == 1 + assert isinstance(block.payload[0], _WebVTTCueTextSpan) + assert block.payload[0].text == "Good." 
From 2e9663e1ef8abdbcb70496b40cafa6218d164d25 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 14 Nov 2025 14:53:05 +0100 Subject: [PATCH 02/20] fix(webvtt): deal with HTML entities in cue text spans Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 15 ++++++++++++++- test/test_webvtt.py | 6 ++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index eccae4a6..d7cabdc3 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -98,10 +98,23 @@ class _WebVTTCueTextSpan(BaseModel): text: str span_type: Literal["text"] = "text" + _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} + _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);") + @field_validator("text", mode="after") @classmethod def validate_text(cls, value: str) -> str: - if any(ch in value for ch in {"\n", "\r", "&", "<"}): + for match in cls._entity_pattern.finditer(value): + entity = match.group(1) + if entity not in cls._valid_entities: + raise ValueError( + f"Cue text span contains an invalid HTML entity: &{entity};" + ) + if "&" in re.sub(cls._entity_pattern, "", value): + raise ValueError( + "Found '&' not part of a valid entity in the cue text span" + ) + if any(ch in value for ch in {"\n", "\r", "<"}): raise ValueError("Cue text span contains invalid characters") if len(value) == 0: raise ValueError("Cue text span cannot be empty") diff --git a/test/test_webvtt.py b/test/test_webvtt.py index 75f5dfc1..ea4f2889 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -96,6 +96,12 @@ def test_vtt_cue_commponents(): invalid_text = "This cue text span contains &." 
with pytest.raises(ValidationError): _WebVTTCueTextSpan(text=invalid_text) + invalid_text = "An invalid &foo; entity" + with pytest.raises(ValidationError): + _WebVTTCueTextSpan(text=invalid_text) + valid_text = "My favorite book is Pride & Prejudice" + span = _WebVTTCueTextSpan(text=valid_text) + assert span.text == valid_text """Test with text containing less-than sign.""" invalid_text = "This cue text span contains <." From ea303dbe46f437274d984e4d06769f7734c375cb Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Mon, 17 Nov 2025 03:32:05 +0100 Subject: [PATCH 03/20] refactor(webvtt): support more WebVTT models Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 367 +++++++++++++++++++------------ test/test_webvtt.py | 137 +++++++++--- 2 files changed, 332 insertions(+), 172 deletions(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index d7cabdc3..6d60a2d8 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -2,7 +2,8 @@ import logging import re -from typing import Annotated, ClassVar, Literal, Optional, Union, cast +from enum import Enum +from typing import Annotated, ClassVar, Literal, Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic.types import StringConstraints @@ -11,8 +12,24 @@ _log = logging.getLogger(__name__) +_VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} +_ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);") +_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] + + +class _WebVTTLineTerminator(str, Enum): + CRLF = "\r\n" + LF = "\n" + CR = "\r" + + +_WebVTTCueIdentifier = Annotated[ + str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") +] + + class _WebVTTTimestamp(BaseModel): - """Model representing a WebVTT timestamp. + """WebVTT timestamp. 
A WebVTT timestamp is always interpreted relative to the current playback position of the media data that the WebVTT file is to be synchronized with. @@ -67,13 +84,8 @@ def __str__(self) -> str: return self.raw -_WebVTTCueIdentifier = Annotated[ - str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") -] - - class _WebVTTCueTimings(BaseModel): - """Model representating WebVTT cue timings.""" + """WebVTT cue timings.""" start: Annotated[ _WebVTTTimestamp, Field(description="Start time offset of the cue") @@ -93,31 +105,27 @@ def __str__(self): class _WebVTTCueTextSpan(BaseModel): - """Model representing a WebVTT cue text span.""" + """WebVTT cue text span.""" - text: str - span_type: Literal["text"] = "text" - - _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} - _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);") + kind: Literal["text"] = "text" + text: Annotated[str, Field(description="The cue text.")] @field_validator("text", mode="after") @classmethod - def validate_text(cls, value: str) -> str: - for match in cls._entity_pattern.finditer(value): + def is_valid_text(cls, value: str) -> str: + for match in _ENTITY_PATTERN.finditer(value): entity = match.group(1) - if entity not in cls._valid_entities: + if entity not in _VALID_ENTITIES: raise ValueError( - f"Cue text span contains an invalid HTML entity: &{entity};" + f"Cue text contains an invalid HTML entity: &{entity};" ) - if "&" in re.sub(cls._entity_pattern, "", value): - raise ValueError( - "Found '&' not part of a valid entity in the cue text span" - ) + if "&" in re.sub(_ENTITY_PATTERN, "", value): + raise ValueError("Found '&' not part of a valid entity in the cue text") if any(ch in value for ch in {"\n", "\r", "<"}): - raise ValueError("Cue text span contains invalid characters") + raise ValueError("Cue text contains invalid characters") if len(value) == 0: - raise ValueError("Cue text span cannot be empty") + raise ValueError("Cue text cannot 
be empty") + return value @override @@ -125,37 +133,48 @@ def __str__(self): return self.text -class _WebVTTCueVoiceSpan(BaseModel): - """Model representing a WebVTT cue voice span.""" +class _WebVTTCueComponentWithTerminator(BaseModel): + """WebVTT caption or subtitle cue component optionally with a line terminator.""" - annotation: Annotated[ - str, + component: "_WebVTTCueComponent" + terminator: Optional[_WebVTTLineTerminator] = None + + @override + def __str__(self): + return f"{self.component}{self.terminator.value if self.terminator else ''}" + + +class _WebVTTCueInternalText(BaseModel): + """WebVTT cue internal text.""" + + terminator: Optional[_WebVTTLineTerminator] = None + components: Annotated[ + list[_WebVTTCueComponentWithTerminator], Field( description=( - "Cue span start tag annotation text representing the name of thevoice" + "WebVTT caption or subtitle cue components representing the " + "cue internal text" ) ), - ] + ] = [] + + @override + def __str__(self): + cue_str = ( + f"{self.terminator.value if self.terminator else ''}" + f"{''.join(str(span) for span in self.components)}" + ) + return cue_str + + +class _WebVTTCueSpanStartTag(BaseModel): + """WebVTT cue span start tag.""" + + name: Annotated[_START_TAG_NAMES, Field(description="The tag name")] classes: Annotated[ list[str], Field(description="List of classes representing the cue span's significance"), ] = [] - components: Annotated[ - list["_WebVTTCueComponent"], - Field(description="The components representing the cue internal text"), - ] = [] - span_type: Literal["v"] = "v" - - @field_validator("annotation", mode="after") - @classmethod - def validate_annotation(cls, value: str) -> str: - if any(ch in value for ch in {"\n", "\r", "&", ">"}): - raise ValueError( - "Cue span start tag annotation contains invalid characters" - ) - if not value: - raise ValueError("Cue text span cannot be empty") - return value @field_validator("classes", mode="after") @classmethod @@ -169,51 +188,113 @@ 
def validate_classes(cls, value: list[str]) -> list[str]: raise ValueError("Cue span start tag classes cannot be empty") return value + def _get_name_with_classes(self) -> str: + return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name + @override def __str__(self): - tag = f"v.{'.'.join(self.classes)}" if self.classes else "v" - inner = "".join(str(span) for span in self.components) - return f"<{tag} {self.annotation}>{inner}" + return f"<{self._get_name_with_classes()}>" -class _WebVTTCueClassSpan(BaseModel): - span_type: Literal["c"] = "c" - components: list["_WebVTTCueComponent"] +class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag): + """WebVTT cue span start tag requiring an annotation.""" - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" + annotation: Annotated[str, Field(description="Cue span start tag annotation")] + @field_validator("annotation", mode="after") + @classmethod + def is_valid_annotation(cls, value: str) -> str: + for match in _ENTITY_PATTERN.finditer(value): + entity = match.group(1) + if entity not in _VALID_ENTITIES: + raise ValueError( + f"Annotation contains an invalid HTML entity: &{entity};" + ) + if "&" in re.sub(_ENTITY_PATTERN, "", value): + raise ValueError("Found '&' not part of a valid entity in annotation") + if any(ch in value for ch in {"\n", "\r", ">"}): + raise ValueError("Annotation contains invalid characters") + if len(value) == 0: + raise ValueError("Annotation cannot be empty") -class _WebVTTCueItalicSpan(BaseModel): - span_type: Literal["i"] = "i" - components: list["_WebVTTCueComponent"] + return value @override def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" + return f"<{self._get_name_with_classes()} {self.annotation}>" -class _WebVTTCueBoldSpan(BaseModel): - span_type: Literal["b"] = "b" - components: list["_WebVTTCueComponent"] +class _WebVTTCueComponentBase(BaseModel): + 
"""WebVTT caption or subtitle cue component. - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" + All the WebVTT caption or subtitle cue components are represented by this class + except the WebVTT cue text span, which requires different definitions. + """ + kind: Literal["c", "b", "i", "u", "v", "lang"] + start_tag: _WebVTTCueSpanStartTag + internal_text: _WebVTTCueInternalText -class _WebVTTCueUnderlineSpan(BaseModel): - span_type: Literal["u"] = "u" - components: list["_WebVTTCueComponent"] + @model_validator(mode="after") + def check_tag_names_match(self) -> Self: + if self.kind != self.start_tag.name: + raise ValueError("The tag name of this cue component should be {self.kind}") + return self @override def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" + return f"{self.start_tag}{self.internal_text}" + + +class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase): + """WebVTT cue voice span associated with a specific voice.""" + + kind: Literal["v"] = "v" + start_tag: _WebVTTCueSpanStartTagAnnotated + + +class _WebVTTCueClassSpan(_WebVTTCueComponentBase): + """WebVTT cue class span. + + It represents a span of text and it is used to annotate parts of the cue with + applicable classes without implying further meaning (such as italics or bold). 
+ """ + + kind: Literal["c"] = "c" + start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c") + + +class _WebVTTCueItalicSpan(_WebVTTCueComponentBase): + """WebVTT cue italic span representing a span of italic text.""" + + kind: Literal["i"] = "i" + start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i") + + +class _WebVTTCueBoldSpan(_WebVTTCueComponentBase): + """WebVTT cue bold span representing a span of bold text.""" + + kind: Literal["b"] = "b" + start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b") + + +class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase): + """WebVTT cue underline span representing a span of underline text.""" + + kind: Literal["u"] = "u" + start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u") + + +class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase): + """WebVTT cue language span. + + It represents a span of text and it is used to annotate parts of the cue where the + applicable language might be different than the surrounding text's, without + implying further meaning (such as italics or bold). 
+ """ + + kind: Literal["lang"] = "lang" + start_tag: _WebVTTCueSpanStartTagAnnotated _WebVTTCueComponent = Annotated[ @@ -224,8 +305,12 @@ def __str__(self): _WebVTTCueBoldSpan, _WebVTTCueUnderlineSpan, _WebVTTCueVoiceSpan, + _WebVTTCueLanguageSpan, ], - Field(discriminator="span_type", description="The WebVTT cue component"), + Field( + discriminator="kind", + description="The type of WebVTT caption or subtitle cue component.", + ), ] @@ -243,14 +328,17 @@ class _WebVTTCueBlock(BaseModel): None, description="The WebVTT cue identifier" ) timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] - payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")] + payload: Annotated[ + list[_WebVTTCueComponentWithTerminator], + Field(description="The WebVTT caption or subtitle cue text"), + ] - _pattern_block: ClassVar[re.Pattern] = re.compile( - r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>" - ) - _pattern_voice_tag: ClassVar[re.Pattern] = re.compile( - r"^\.[^\t\n\r &<>]+)?" 
# zero or more classes - r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation + # pattern of a WebVTT cue span start/end tag + _pattern_tag: ClassVar[re.Pattern] = re.compile( + r"<(?P/?)" + r"(?Pi|b|c|u|v|lang)" + r"(?P(?:\.[^\t\n\r &<>.]+)*)" + r"(?:[ \t](?P[^\n\r&>]*))?>" ) @field_validator("payload", mode="after") @@ -284,74 +372,77 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end) ) cue_text = " ".join(cue_lines).strip() - if cue_text.startswith("" not in cue_text: - # adding close tag for cue voice spans without end tag - cue_text += "" + # adding close tag for cue spans without end tag + for omm in {"v"}: + if cue_text.startswith(f"<{omm}") and f"" not in cue_text: + cue_text += f"" + break - stack: list[list[_WebVTTCueComponent]] = [[]] - tag_stack: list[Union[str, tuple]] = [] + stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]] + tag_stack: list[dict] = [] pos = 0 - matches = list(cls._pattern_block.finditer(cue_text)) + matches = list(cls._pattern_tag.finditer(cue_text)) i = 0 while i < len(matches): match = matches[i] if match.start() > pos: - stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) - tag = match.group(0) - - if tag.startswith(("", "", "", "")): - tag_type = tag[1:2] - tag_stack.append(tag_type) - stack.append([]) - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueItalicSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueBoldSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueUnderlineSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueClassSpan(components=children)) - tag_stack.pop() - elif tag.startswith("")) else: parts.append(str(span)) - return "".join(parts) + return "".join(parts) + "\n" class _WebVTTFile(BaseModel): diff 
--git a/test/test_webvtt.py b/test/test_webvtt.py index ea4f2889..b4d408cb 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -1,11 +1,20 @@ -# Assisted by watsonx Code Assistant +"""Test the data model for WebVTT files. +Assisted by watsonx Code Assistant. +Examples extracted from https://www.w3.org/TR/webvtt1/ +Copyright © 2019 World Wide Web Consortium. +""" import pytest from pydantic import ValidationError from docling_core.types.doc.webvtt import ( + _WebVTTCueBlock, + _WebVTTCueComponentWithTerminator, + _WebVTTCueInternalText, _WebVTTCueItalicSpan, + _WebVTTCueLanguageSpan, + _WebVTTCueSpanStartTagAnnotated, _WebVTTCueTextSpan, _WebVTTCueTimings, _WebVTTCueVoiceSpan, @@ -18,7 +27,7 @@ GENERATE = GEN_TEST_DATA -def test_vtt_cue_commponents(): +def test_vtt_cue_commponents() -> None: """Test WebVTT components.""" valid_timestamps = [ "00:01:02.345", @@ -72,13 +81,13 @@ def test_vtt_cue_commponents(): """Test invalid cue timings with missing end.""" start = _WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(start=start) + _WebVTTCueTimings(start=start) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) """Test invalid cue timings with missing start.""" end = _WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(end=end) + _WebVTTCueTimings(end=end) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) """Test with valid text.""" @@ -116,44 +125,105 @@ def test_vtt_cue_commponents(): valid_annotation = "valid-annotation" invalid_annotation = "invalid\nannotation" with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=invalid_annotation) - assert _WebVTTCueVoiceSpan(annotation=valid_annotation) + _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation) + assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation) """Test that classes validation works correctly.""" 
annotation = "speaker name" valid_classes = ["class1", "class2"] invalid_classes = ["class\nwith\nnewlines", ""] with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes) - assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes) + _WebVTTCueSpanStartTagAnnotated( + name="v", annotation=annotation, classes=invalid_classes + ) + assert _WebVTTCueSpanStartTagAnnotated( + name="v", annotation=annotation, classes=valid_classes + ) """Test that components validation works correctly.""" annotation = "speaker name" - valid_components = [_WebVTTCueTextSpan(text="random text")] + valid_components = [ + _WebVTTCueComponentWithTerminator( + component=_WebVTTCueTextSpan(text="random text") + ) + ] invalid_components = [123, "not a component"] with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components) - assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components) + _WebVTTCueInternalText(components=invalid_components) + assert _WebVTTCueInternalText(components=valid_components) """Test valid cue voice spans.""" cue_span = _WebVTTCueVoiceSpan( - annotation="speaker", - classes=["loud", "clear"], - components=[_WebVTTCueTextSpan(text="random text")], + start_tag=_WebVTTCueSpanStartTagAnnotated( + name="v", annotation="speaker", classes=["loud", "clear"] + ), + internal_text=_WebVTTCueInternalText( + components=[ + _WebVTTCueComponentWithTerminator( + component=_WebVTTCueTextSpan(text="random text") + ) + ] + ), ) - expected_str = "random text" assert str(cue_span) == expected_str cue_span = _WebVTTCueVoiceSpan( - annotation="speaker", - components=[_WebVTTCueTextSpan(text="random text")], + start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"), + internal_text=_WebVTTCueInternalText( + components=[ + _WebVTTCueComponentWithTerminator( + component=_WebVTTCueTextSpan(text="random text") + ) + ] + ), ) expected_str = "random 
text" assert str(cue_span) == expected_str -def test_webvtt_file(): +def test_webvttcueblock_parse() -> None: + """Test the method parse of _WebVTTCueBlock class.""" + raw: str = ( + "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n" + ) + block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw) + assert str(block.timings) == "04:02.500 --> 04:05.000" + assert len(block.payload) == 1 + assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert ( + block.payload[0].component.text + == "J’ai commencé le basket à l'âge de 13, 14 ans" + ) + assert raw == str(block) + + raw = ( + "04:05.001 --> 04:07.800\n" + "Sur les playground, ici à Montpellier\n" + ) + block = _WebVTTCueBlock.parse(raw) + assert str(block.timings) == "04:05.001 --> 04:07.800" + assert len(block.payload) == 3 + assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert block.payload[0].component.text == "Sur les " + assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan) + assert len(block.payload[1].component.internal_text.components) == 1 + lang_span = block.payload[1].component.internal_text.components[0].component + assert isinstance(lang_span, _WebVTTCueLanguageSpan) + assert isinstance( + lang_span.internal_text.components[0].component, _WebVTTCueTextSpan + ) + assert lang_span.internal_text.components[0].component.text == "playground" + assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[2].component, _WebVTTCueTextSpan) + assert block.payload[2].component.text == ", ici à Montpellier" + assert raw == str(block) + + +def test_webvtt_file() -> None: """Test WebVTT files.""" with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f: content = 
f.read() @@ -163,16 +233,16 @@ def test_webvtt_file(): assert str(block.timings) == "00:32.500 --> 00:33.500" assert len(block.payload) == 1 cue_span = block.payload[0] - assert isinstance(cue_span, _WebVTTCueVoiceSpan) - assert cue_span.annotation == "Neil deGrasse Tyson" - assert not cue_span.classes - assert len(cue_span.components) == 1 - comp = cue_span.components[0] - assert isinstance(comp, _WebVTTCueItalicSpan) - assert len(comp.components) == 1 - comp2 = comp.components[0] - assert isinstance(comp2, _WebVTTCueTextSpan) - assert comp2.text == "Laughs" + assert isinstance(cue_span.component, _WebVTTCueVoiceSpan) + assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson" + assert not cue_span.component.start_tag.classes + assert len(cue_span.component.internal_text.components) == 1 + comp = cue_span.component.internal_text.components[0] + assert isinstance(comp.component, _WebVTTCueItalicSpan) + assert len(comp.component.internal_text.components) == 1 + comp2 = comp.component.internal_text.components[0] + assert isinstance(comp2.component, _WebVTTCueTextSpan) + assert comp2.component.text == "Laughs" with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f: content = f.read() @@ -182,8 +252,8 @@ def test_webvtt_file(): "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. 
" "https://www.w3.org/TR/webvtt1/\n\n" ) - reverse += "\n\n".join([str(block) for block in vtt.cue_blocks]) - assert content == reverse + reverse += "\n".join([str(block) for block in vtt.cue_blocks]) + assert content == reverse.rstrip() with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f: content = f.read() @@ -195,11 +265,10 @@ def test_webvtt_file(): assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" assert str(block.timings) == "00:00:04.963 --> 00:00:08.571" assert len(block.payload) == 1 - assert isinstance(block.payload[0], _WebVTTCueVoiceSpan) + assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan) block = vtt.cue_blocks[2] - assert isinstance(cue_span, _WebVTTCueVoiceSpan) assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" assert str(block.timings) == "00:00:10.683 --> 00:00:11.563" assert len(block.payload) == 1 - assert isinstance(block.payload[0], _WebVTTCueTextSpan) - assert block.payload[0].text == "Good." + assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert block.payload[0].component.text == "Good." 
From 0122141cfc50d4e8448a8a2e5f427f440eb06899 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Thu, 27 Nov 2025 18:58:35 +0100 Subject: [PATCH 04/20] refactor(DoclingDocument): create a new provenance model for media file types Signed-off-by: Cesar Berrospi Ramis --- docling_core/transforms/serializer/azure.py | 17 +- docling_core/transforms/serializer/common.py | 20 +- docling_core/transforms/serializer/doctags.py | 14 +- .../visualizer/key_value_visualizer.py | 13 +- .../visualizer/layout_visualizer.py | 16 +- .../visualizer/reading_order_visualizer.py | 3 +- .../transforms/visualizer/table_visualizer.py | 11 +- docling_core/types/doc/__init__.py | 1 + docling_core/types/doc/document.py | 196 ++++++++++----- docling_core/types/doc/webvtt.py | 73 ++---- docling_core/utils/legacy.py | 8 +- docs/DoclingDocument.json | 229 ++++++++++++++++-- 12 files changed, 439 insertions(+), 162 deletions(-) diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py index 385aca6a..ed91aee2 100644 --- a/docling_core/transforms/serializer/azure.py +++ b/docling_core/transforms/serializer/azure.py @@ -44,9 +44,10 @@ DocSerializer, create_ser_result, ) -from docling_core.types.doc.base import CoordOrigin -from docling_core.types.doc.document import ( +from docling_core.types.doc import ( + CoordOrigin, DocItem, + DocItemLabel, DoclingDocument, FormItem, InlineGroup, @@ -54,12 +55,12 @@ ListGroup, NodeItem, PictureItem, + ProvenanceItem, RefItem, RichTableCell, TableItem, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel def _bbox_to_polygon_coords( @@ -76,7 +77,7 @@ def _bbox_to_polygon_coords( def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]: """Compute a TOPLEFT-origin polygon for the first provenance of the item.""" - if not item.prov: + if not item.prov or not isinstance(item.prov[0], ProvenanceItem): return None prov = item.prov[0] @@ -187,7 +188,7 @@ def serialize( 
# Lists may be represented either as TextItem(ListItem) or via groups; # we treat any TextItem as a paragraph-like entry. - if item.prov: + if item.prov and isinstance(item.prov[0], ProvenanceItem): prov = item.prov[0] page_no = prov.page_no polygon = _bbox_to_polygon_for_item(doc, item) @@ -237,7 +238,7 @@ def serialize( ) -> SerializationResult: assert isinstance(doc_serializer, AzureDocSerializer) - if not item.prov: + if not item.prov or not isinstance(item.prov[0], ProvenanceItem): return create_ser_result() prov = item.prov[0] @@ -308,7 +309,7 @@ def serialize( ) -> SerializationResult: assert isinstance(doc_serializer, AzureDocSerializer) - if not item.prov: + if not item.prov or not isinstance(item.prov[0], ProvenanceItem): return create_ser_result() prov = item.prov[0] @@ -324,7 +325,7 @@ def serialize( for foot_ref in item.footnotes: if isinstance(foot_ref, RefItem): tgt = foot_ref.resolve(doc) - if isinstance(tgt, TextItem) and tgt.prov: + if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem): f_poly = _bbox_to_polygon_for_item(doc, tgt) if f_poly is not None: foots.append( diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 3a8ad71c..c9c497f4 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -35,11 +35,11 @@ SerializationResult, Span, ) -from docling_core.types.doc.document import ( - DOCUMENT_TOKENS_EXPORT_LABELS, +from docling_core.types.doc import ( ContentLayer, DescriptionAnnotation, DocItem, + DocItemLabel, DoclingDocument, FloatingItem, Formatting, @@ -52,12 +52,13 @@ PictureDataType, PictureItem, PictureMoleculeData, + ProvenanceItem, Script, TableAnnotationType, TableItem, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS _DEFAULT_LAYERS = 
set(ContentLayer) @@ -108,7 +109,7 @@ def _iterate_items( add_page_breaks=add_page_breaks, visited=my_visited, ): - if isinstance(it, DocItem) and it.prov: + if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem): page_no = it.prov[0].page_no if prev_page_nr is not None and page_no > prev_page_nr: yield ( @@ -120,7 +121,7 @@ def _iterate_items( lvl, ) break - elif isinstance(item, DocItem) and item.prov: + elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem): page_no = item.prov[0].page_no if prev_page_nr is None or page_no > prev_page_nr: if prev_page_nr is not None: # close previous range @@ -301,7 +302,13 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: or item.content_layer not in params.layers or ( params.pages is not None - and ((not item.prov) or item.prov[0].page_no not in params.pages) + and ( + (not item.prov) + or ( + isinstance(item.prov[0], ProvenanceItem) + and item.prov[0].page_no not in params.pages + ) + ) ) ) ) @@ -671,6 +678,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]: if ( isinstance(item, DocItem) and item.prov + and isinstance(item.prov[0], ProvenanceItem) and (self.params.pages is None or item.prov[0].page_no in self.params.pages) and ix >= self.params.start_idx and ix < self.params.stop_idx diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index e5672638..16549652 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -26,11 +26,13 @@ _should_use_legacy_annotations, create_ser_result, ) -from docling_core.types.doc.base import BoundingBox from docling_core.types.doc.document import ( + BoundingBox, CodeItem, DocItem, + DocItemLabel, DoclingDocument, + DocumentToken, FloatingItem, FormItem, GroupItem, @@ -40,6 +42,7 @@ ListItem, NodeItem, PictureClassificationData, + PictureClassificationLabel, PictureItem, PictureMoleculeData, 
PictureTabularChartData, @@ -47,10 +50,9 @@ SectionHeaderItem, TableData, TableItem, + TableToken, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel -from docling_core.types.doc.tokens import DocumentToken, TableToken def _wrap(text: str, wrap_tag: str) -> str: @@ -343,7 +345,7 @@ def serialize( results: list[SerializationResult] = [] page_no = 1 - if len(item.prov) > 0: + if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem): page_no = item.prov[0].page_no if params.add_location: @@ -361,7 +363,7 @@ def serialize( for cell in item.graph.cells: cell_txt = "" - if cell.prov is not None: + if cell.prov is not None and isinstance(cell.prov, ProvenanceItem): if len(doc.pages.keys()): page_w, page_h = doc.pages[page_no].size.as_tuple() cell_txt += DocumentToken.get_location( @@ -469,7 +471,7 @@ def _get_inline_location_tags( doc_items: list[DocItem] = [] for it, _ in doc.iterate_items(root=item): if isinstance(it, DocItem): - for prov in it.prov: + for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)): boxes.append(prov.bbox) doc_items.append(it) if prov is None: diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py index 5ed7b843..e2b10264 100644 --- a/docling_core/transforms/visualizer/key_value_visualizer.py +++ b/docling_core/transforms/visualizer/key_value_visualizer.py @@ -16,8 +16,13 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc.document import ContentLayer, DoclingDocument -from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel +from docling_core.types.doc import ( + ContentLayer, + DoclingDocument, + GraphCellLabel, + GraphLinkLabel, + ProvenanceItem, +) # --------------------------------------------------------------------------- # Helper functions / constants @@ -82,7 +87,7 @@ def 
_draw_key_value_layer( # First draw cells (rectangles + optional labels) # ------------------------------------------------------------------ for cell in cell_dict.values(): - if cell.prov is None or cell.prov.page_no != page_no: + if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no: continue # skip cells not on this page or without bbox tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height) @@ -149,6 +154,8 @@ def _draw_key_value_layer( if ( src_cell.prov is None or tgt_cell.prov is None + or not isinstance(src_cell.prov, ProvenanceItem) + or not isinstance(tgt_cell.prov, ProvenanceItem) or src_cell.prov.page_no != page_no or tgt_cell.prov.page_no != page_no ): diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py index 369a7b38..8ac6bf81 100644 --- a/docling_core/transforms/visualizer/layout_visualizer.py +++ b/docling_core/transforms/visualizer/layout_visualizer.py @@ -10,10 +10,16 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.base import CoordOrigin -from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument -from docling_core.types.doc.page import BoundingRectangle, TextCell +from docling_core.types.doc import ( + BoundingRectangle, + ContentLayer, + CoordOrigin, + DocItem, + DocItemLabel, + DoclingDocument, + ProvenanceItem, + TextCell, +) class _TLBoundingRectangle(BoundingRectangle): @@ -173,7 +179,7 @@ def _draw_doc_layout( if len(elem.prov) == 0: continue # Skip elements without provenances - for prov in elem.prov: + for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)): page_nr = prov.page_no if page_nr in my_images: diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py 
b/docling_core/transforms/visualizer/reading_order_visualizer.py index 60874333..27583613 100644 --- a/docling_core/transforms/visualizer/reading_order_visualizer.py +++ b/docling_core/transforms/visualizer/reading_order_visualizer.py @@ -14,6 +14,7 @@ DocItem, DoclingDocument, PictureItem, + ProvenanceItem, ) @@ -130,7 +131,7 @@ def _draw_doc_reading_order( if len(elem.prov) == 0: continue # Skip elements without provenances - for prov in elem.prov: + for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)): page_no = prov.page_no image = my_images.get(page_no) diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py index 489a6d9a..d3790d6b 100644 --- a/docling_core/transforms/visualizer/table_visualizer.py +++ b/docling_core/transforms/visualizer/table_visualizer.py @@ -10,7 +10,12 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem +from docling_core.types.doc import ( + ContentLayer, + DoclingDocument, + ProvenanceItem, + TableItem, +) _log = logging.getLogger(__name__) @@ -185,10 +190,10 @@ def _draw_doc_tables( image = pil_img.copy() my_images[page_nr] = image - for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)): + for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)): if not isinstance(elem, TableItem): continue - if len(elem.prov) == 0: + if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem): continue # Skip elements without provenances if len(elem.prov) == 1: diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index 3c699f89..f0e0e92d 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -61,6 +61,7 @@ Script, SectionHeaderItem, SummaryMetaField, + 
TableAnnotationType, TableCell, TableData, TableItem, diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 0ecc3e51..e071b2b9 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -32,10 +32,12 @@ AnyUrl, BaseModel, ConfigDict, + Discriminator, Field, FieldSerializationInfo, SerializerFunctionWrapHandler, StringConstraints, + Tag, computed_field, field_serializer, field_validator, @@ -65,6 +67,7 @@ ) from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import parse_otsl_table_content, relative_path +from docling_core.types.doc.webvtt import _WebVTTTimestamp _logger = logging.getLogger(__name__) @@ -1155,11 +1158,81 @@ def from_multipage_doctags_and_images( class ProvenanceItem(BaseModel): - """ProvenanceItem.""" + """Provenance information for elements extracted from a textual document. - page_no: int - bbox: BoundingBox - charspan: tuple[int, int] + A `ProvenanceItem` object acts as a lightweight pointer back into the original + document for an extracted element. It applies to documents with an explicit + or implicit layout, such as PDF, HTML, docx, or pptx. + """ + + page_no: Annotated[int, Field(description="Page number")] + bbox: Annotated[BoundingBox, Field(description="Bounding box")] + charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")] + + +class ProvenanceTrack(BaseModel): + """Provenance information for elements extracted from media assets. + + A `ProvenanceTrack` instance describes a cue in a text track associated with a + media element (audio, video, subtitles, screen recordings, ...). 
+ """ + + start_time: Annotated[ + _WebVTTTimestamp, + Field( + examples=["00.11.000", "00:00:06.500", "01:28:34.300"], + description="Start time offset of the track cue", + ), + ] + end_time: Annotated[ + _WebVTTTimestamp, + Field( + examples=["00.12.000", "00:00:08.200", "01:29:30.100"], + description="End time offset of the track cue", + ), + ] + identifier: Optional[str] = Field( + None, + examples=["test", "123", "b72d946"], + description="An identifier of the cue", + ) + voice: Optional[str] = Field( + None, + examples=["Mary", "Fred", "Name Surname"], + description="The cue voice (speaker)", + ) + language: Optional[str] = Field( + None, + examples=["en", "en-GB", "fr-CA"], + description="Language of the cue in BCP 47 language tag format", + ) + classes: Optional[list[str]] = Field( + None, + min_length=1, + examples=["first", "loud", "yellow"], + description="Classes for describing the cue significance", + ) + + +def get_provenance_discriminator_value(v: Any) -> str: + """Callable discriminator for provenance instances. + + Args: + v: Either dict or model input. + + Returns: + A string discriminator of provenance instances. 
+ """ + fields = {"bbox", "page_no", "charspan"} + if isinstance(v, dict): + return "item" if any(f in v for f in fields) else "track" + return "item" if any(hasattr(v, f) for f in fields) else "track" + + +ProvenanceType = Annotated[ + Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]], + Discriminator(get_provenance_discriminator_value), +] class ContentLayer(str, Enum): @@ -1468,7 +1541,7 @@ class DocItem(NodeItem): # Base type for any element that carries content, can """DocItem.""" label: DocItemLabel - prov: list[ProvenanceItem] = [] + prov: list[ProvenanceType] = [] comments: list[FineRef] = [] # References to comment items annotating this content @model_serializer(mode="wrap") @@ -1493,7 +1566,7 @@ def get_location_tokens( return "" location = "" - for prov in self.prov: + for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)): page_w, page_h = doc.pages[prov.page_no].size.as_tuple() loc_str = DocumentToken.get_location( @@ -1515,10 +1588,13 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL if a valid image of the page containing this DocItem is not available in doc. 
""" - if not len(self.prov): + if not self.prov or prov_index >= len(self.prov): + return None + prov = self.prov[prov_index] + if not isinstance(prov, ProvenanceItem): return None - page = doc.pages.get(self.prov[prov_index].page_no) + page = doc.pages.get(prov.page_no) if page is None or page.size is None or page.image is None: return None @@ -1526,9 +1602,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL if not page_image: return None crop_bbox = ( - self.prov[prov_index] - .bbox.to_top_left_origin(page_height=page.size.height) - .scale_to_size(old_size=page.size, new_size=page.image.size) + prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size( + old_size=page.size, new_size=page.image.size + ) # .scaled(scale=page_image.height / page.size.height) ) return page_image.crop(crop_bbox.as_tuple()) @@ -2199,7 +2275,7 @@ def export_to_otsl( return "" page_no = 0 - if len(self.prov) > 0: + if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem): page_no = self.prov[0].page_no for i in range(nrows): @@ -2329,7 +2405,7 @@ class GraphCell(BaseModel): text: str # sanitized text orig: str # text as seen on document - prov: Optional[ProvenanceItem] = None + prov: Optional[ProvenanceType] = None # in case you have a text, table or picture item item_ref: Optional[RefItem] = None @@ -2978,7 +3054,7 @@ def add_list_item( enumerated: bool = False, marker: Optional[str] = None, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -2989,7 +3065,7 @@ def add_list_item( :param label: str: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) 
""" @@ -3030,7 +3106,7 @@ def add_text( label: DocItemLabel, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3041,7 +3117,7 @@ def add_text( :param label: str: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ @@ -3167,7 +3243,7 @@ def add_table( self, data: TableData, caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet. - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, label: DocItemLabel = DocItemLabel.TABLE, content_layer: Optional[ContentLayer] = None, @@ -3177,7 +3253,7 @@ def add_table( :param data: TableData: :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) :param label: DocItemLabel: (Default value = DocItemLabel.TABLE) @@ -3213,7 +3289,7 @@ def add_picture( annotations: Optional[list[PictureDataType]] = None, image: Optional[ImageRef] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, ): @@ -3222,7 +3298,7 @@ def add_picture( :param data: Optional[list[PictureData]]: (Default value = None) :param caption: Optional[Union[TextItem: :param RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = 
None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3254,7 +3330,7 @@ def add_title( self, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3265,7 +3341,7 @@ def add_title( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3300,7 +3376,7 @@ def add_code( code_language: Optional[CodeLanguageLabel] = None, orig: Optional[str] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3313,7 +3389,7 @@ def add_code( :param orig: Optional[str]: (Default value = None) :param caption: Optional[Union[TextItem: :param RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3351,7 +3427,7 @@ def add_formula( self, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3362,7 +3438,7 @@ def add_formula( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: 
Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3396,7 +3472,7 @@ def add_heading( text: str, orig: Optional[str] = None, level: LevelNumber = 1, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3408,7 +3484,7 @@ def add_heading( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3441,13 +3517,13 @@ def add_heading( def add_key_values( self, graph: GraphData, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, ): """add_key_values. :param graph: GraphData: - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3472,13 +3548,13 @@ def add_key_values( def add_form( self, graph: GraphData, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, parent: Optional[NodeItem] = None, ): """add_form. 
:param graph: GraphData: - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3667,7 +3743,7 @@ def insert_list_item( enumerated: bool = False, marker: Optional[str] = None, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -3680,7 +3756,7 @@ def insert_list_item( :param enumerated: bool: (Default value = False) :param marker: Optional[str]: (Default value = None) :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -3739,7 +3815,7 @@ def insert_text( label: DocItemLabel, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -3751,7 +3827,7 @@ def insert_text( :param label: DocItemLabel: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -3851,7 +3927,7 @@ def insert_table( sibling: NodeItem, data: TableData, caption: 
Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, label: DocItemLabel = DocItemLabel.TABLE, content_layer: Optional[ContentLayer] = None, annotations: Optional[list[TableAnnotationType]] = None, @@ -3862,7 +3938,7 @@ def insert_table( :param sibling: NodeItem: :param data: TableData: :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param label: DocItemLabel: (Default value = DocItemLabel.TABLE) :param content_layer: Optional[ContentLayer]: (Default value = None) :param annotations: Optional[list[TableAnnotationType]]: (Default value = None) @@ -3899,7 +3975,7 @@ def insert_picture( annotations: Optional[list[PictureDataType]] = None, image: Optional[ImageRef] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, after: bool = True, ) -> PictureItem: @@ -3909,7 +3985,7 @@ def insert_picture( :param annotations: Optional[list[PictureDataType]]: (Default value = None) :param image: Optional[ImageRef]: (Default value = None) :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param after: bool: (Default value = True) @@ -3943,7 +4019,7 @@ def insert_title( sibling: NodeItem, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -3954,7 +4030,7 @@ def insert_title( :param sibling: NodeItem: 
:param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -3994,7 +4070,7 @@ def insert_code( code_language: Optional[CodeLanguageLabel] = None, orig: Optional[str] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4007,7 +4083,7 @@ def insert_code( :param code_language: Optional[str]: (Default value = None) :param orig: Optional[str]: (Default value = None) :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4049,7 +4125,7 @@ def insert_formula( sibling: NodeItem, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4060,7 +4136,7 @@ def insert_formula( :param sibling: NodeItem: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) 
:param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4099,7 +4175,7 @@ def insert_heading( text: str, orig: Optional[str] = None, level: LevelNumber = 1, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4111,7 +4187,7 @@ def insert_heading( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4149,14 +4225,14 @@ def insert_key_values( self, sibling: NodeItem, graph: GraphData, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, after: bool = True, ) -> KeyValueItem: """Creates a new KeyValueItem item and inserts it into the document. :param sibling: NodeItem: :param graph: GraphData: - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param after: bool: (Default value = True) :returns: KeyValueItem: The newly created KeyValueItem item. @@ -4178,14 +4254,14 @@ def insert_form( self, sibling: NodeItem, graph: GraphData, - prov: Optional[ProvenanceItem] = None, + prov: Optional[ProvenanceType] = None, after: bool = True, ) -> FormItem: """Creates a new FormItem item and inserts it into the document. 
:param sibling: NodeItem: :param graph: GraphData: - :param prov: Optional[ProvenanceItem]: (Default value = None) + :param prov: Optional[ProvenanceType]: (Default value = None) :param after: bool: (Default value = True) :returns: FormItem: The newly created FormItem item. @@ -4522,7 +4598,10 @@ def _iterate_items_with_stack( (not isinstance(root, GroupItem) or with_groups) and ( not isinstance(root, DocItem) - or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov)) + or ( + page_nrs is None + or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem)) + ) ) and root.content_layer in my_layers ) @@ -4625,7 +4704,7 @@ def _with_pictures_refs( image_dir.mkdir(parents=True, exist_ok=True) if image_dir.is_dir(): - for item, level in result.iterate_items(page_no=page_no, with_groups=False): + for item, _ in result.iterate_items(page_no=page_no, with_groups=False): if isinstance(item, PictureItem): img = item.get_image(doc=self) if img is not None: @@ -4644,10 +4723,11 @@ def _with_pictures_refs( else: obj_path = loc_path - if item.image is None: + if item.image is None and isinstance(item.prov[0], ProvenanceItem): scale = img.size[0] / item.prov[0].bbox.width item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale)) - item.image.uri = Path(obj_path) + elif item.image is not None: + item.image.uri = Path(obj_path) # if item.image._pil is not None: # item.image._pil.close() @@ -6049,7 +6129,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) -> if isinstance(new_item, DocItem): # update page numbers # NOTE other prov sources (e.g. 
GraphCell) currently not covered - for prov in new_item.prov: + for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)): prov.page_no += page_delta if item.parent: diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 6d60a2d8..bddd6140 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -23,9 +23,7 @@ class _WebVTTLineTerminator(str, Enum): CR = "\r" -_WebVTTCueIdentifier = Annotated[ - str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") -] +_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] class _WebVTTTimestamp(BaseModel): @@ -39,14 +37,10 @@ class _WebVTTTimestamp(BaseModel): raw: Annotated[ str, - Field( - description="A representation of the WebVTT Timestamp as a single string" - ), + Field(description="A representation of the WebVTT Timestamp as a single string"), ] - _pattern: ClassVar[re.Pattern] = re.compile( - r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$" - ) + _pattern: ClassVar[re.Pattern] = re.compile(r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$") _hours: int _minutes: int _seconds: int @@ -72,12 +66,7 @@ def validate_raw(self) -> Self: @property def seconds(self) -> float: """A representation of the WebVTT Timestamp in seconds.""" - return ( - self._hours * 3600 - + self._minutes * 60 - + self._seconds - + self._millis / 1000.0 - ) + return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0 @override def __str__(self) -> str: @@ -87,9 +76,7 @@ def __str__(self) -> str: class _WebVTTCueTimings(BaseModel): """WebVTT cue timings.""" - start: Annotated[ - _WebVTTTimestamp, Field(description="Start time offset of the cue") - ] + start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")] end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")] @model_validator(mode="after") @@ -116,9 +103,7 @@ def 
is_valid_text(cls, value: str) -> str: for match in _ENTITY_PATTERN.finditer(value): entity = match.group(1) if entity not in _VALID_ENTITIES: - raise ValueError( - f"Cue text contains an invalid HTML entity: &{entity};" - ) + raise ValueError(f"Cue text contains an invalid HTML entity: &{entity};") if "&" in re.sub(_ENTITY_PATTERN, "", value): raise ValueError("Found '&' not part of a valid entity in the cue text") if any(ch in value for ch in {"\n", "\r", "<"}): @@ -150,20 +135,12 @@ class _WebVTTCueInternalText(BaseModel): terminator: Optional[_WebVTTLineTerminator] = None components: Annotated[ list[_WebVTTCueComponentWithTerminator], - Field( - description=( - "WebVTT caption or subtitle cue components representing the " - "cue internal text" - ) - ), + Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")), ] = [] @override def __str__(self): - cue_str = ( - f"{self.terminator.value if self.terminator else ''}" - f"{''.join(str(span) for span in self.components)}" - ) + cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}" return cue_str @@ -181,9 +158,7 @@ class _WebVTTCueSpanStartTag(BaseModel): def validate_classes(cls, value: list[str]) -> list[str]: for item in value: if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): - raise ValueError( - "A cue span start tag class contains invalid characters" - ) + raise ValueError("A cue span start tag class contains invalid characters") if not item: raise ValueError("Cue span start tag classes cannot be empty") return value @@ -207,9 +182,7 @@ def is_valid_annotation(cls, value: str) -> str: for match in _ENTITY_PATTERN.finditer(value): entity = match.group(1) if entity not in _VALID_ENTITIES: - raise ValueError( - f"Annotation contains an invalid HTML entity: &{entity};" - ) + raise ValueError(f"Annotation contains an invalid HTML entity: &{entity};") if "&" in re.sub(_ENTITY_PATTERN, "", 
value): raise ValueError("Found '&' not part of a valid entity in annotation") if any(ch in value for ch in {"\n", "\r", ">"}): @@ -324,9 +297,7 @@ class _WebVTTCueBlock(BaseModel): model_config = ConfigDict(regex_engine="python-re") - identifier: Optional[_WebVTTCueIdentifier] = Field( - None, description="The WebVTT cue identifier" - ) + identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier") timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] payload: Annotated[ list[_WebVTTCueComponentWithTerminator], @@ -368,9 +339,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": start, end = [t.strip() for t in timing_line.split("-->")] end = re.split(" |\t", end)[0] # ignore the cue settings list - timings: _WebVTTCueTimings = _WebVTTCueTimings( - start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end) - ) + timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)) cue_text = " ".join(cue_lines).strip() # adding close tag for cue spans without end tag for omm in {"v"}: @@ -388,9 +357,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": match = matches[i] if match.start() > pos: stack[-1].append( - _WebVTTCueComponentWithTerminator( - component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()]) - ) + _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) ) gps = {k: (v if v else None) for k, v in match.groupdict().items()} @@ -410,9 +377,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": if class_string: classes = [c for c in class_string.split(".") if c] st = ( - _WebVTTCueSpanStartTagAnnotated( - name=ct, classes=classes, annotation=annotation.strip() - ) + _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip()) if annotation else _WebVTTCueSpanStartTag(name=ct, classes=classes) ) @@ -430,19 +395,13 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": cp = 
_WebVTTCueLanguageSpan(start_tag=st, internal_text=it) elif ct == "v": cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it) - stack[-1].append( - _WebVTTCueComponentWithTerminator(component=cp) - ) + stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp)) pos = match.end() i += 1 if pos < len(cue_text): - stack[-1].append( - _WebVTTCueComponentWithTerminator( - component=_WebVTTCueTextSpan(text=cue_text[pos:]) - ) - ) + stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:]))) return cls( identifier=identifier, diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py index 04761799..5ebac4be 100644 --- a/docling_core/utils/legacy.py +++ b/docling_core/utils/legacy.py @@ -7,20 +7,23 @@ from docling_core.types.doc import ( BoundingBox, + ContentLayer, CoordOrigin, DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, + ListItem, PictureItem, ProvenanceItem, SectionHeaderItem, Size, TableCell, + TableData, TableItem, TextItem, ) -from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData from docling_core.types.legacy_doc.base import ( BaseCell, BaseText, @@ -162,6 +165,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f span=[0, len(item.text)], ) for p in item.prov + if isinstance(p, ProvenanceItem) ] main_text.append( BaseText( @@ -283,6 +287,7 @@ def _make_spans(cell: TableCell, table_item: TableItem): span=[0, 0], ) for p in item.prov + if isinstance(p, ProvenanceItem) ], ) ) @@ -310,6 +315,7 @@ def _make_spans(cell: TableCell, table_item: TableItem): span=[0, len(caption)], ) for p in item.prov + if isinstance(p, ProvenanceItem) ], obj_type=doc_item_label_to_legacy_type(item.label), text=caption, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 03b7d8cd..eca74ef4 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -233,7 +233,14 @@ "prov": { "default": [], "items": 
{ - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -651,7 +658,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -793,7 +807,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -873,13 +894,21 @@ "prov": { "anyOf": [ { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, { "type": "null" } ], - "default": null + "default": null, + "title": "Prov" }, "item_ref": { "anyOf": [ @@ -1198,7 +1227,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -1370,7 +1406,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -1746,7 +1789,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -2139,16 +2189,19 @@ "type": "object" }, "ProvenanceItem": { - "description": "ProvenanceItem.", + "description": "Provenance information for elements extracted from a textual document.\n\nA `ProvenanceItem` object acts as a lightweight pointer back into the original\ndocument for an extracted element. 
It applies to documents with an explicity\nor implicit layout, such as PDF, HTML, docx, or pptx.", "properties": { "page_no": { + "description": "Page number", "title": "Page No", "type": "integer" }, "bbox": { - "$ref": "#/$defs/BoundingBox" + "$ref": "#/$defs/BoundingBox", + "description": "Bounding box" }, "charspan": { + "description": "Character span (0-indexed)", "maxItems": 2, "minItems": 2, "prefixItems": [ @@ -2171,6 +2224,111 @@ "title": "ProvenanceItem", "type": "object" }, + "ProvenanceTrack": { + "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).", + "properties": { + "start_time": { + "$ref": "#/$defs/_WebVTTTimestamp", + "description": "Start time offset of the track cue", + "examples": [ + "00.11.000", + "00:00:06.500", + "01:28:34.300" + ] + }, + "end_time": { + "$ref": "#/$defs/_WebVTTTimestamp", + "description": "End time offset of the track cue", + "examples": [ + "00.12.000", + "00:00:08.200", + "01:29:30.100" + ] + }, + "identifier": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An identifier of the cue", + "examples": [ + "test", + "123", + "b72d946" + ], + "title": "Identifier" + }, + "voice": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The cue voice (speaker)", + "examples": [ + "Mary", + "Fred", + "Name Surname" + ], + "title": "Voice" + }, + "language": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Language of the cue in BCP 47 language tag format", + "examples": [ + "en", + "en-GB", + "fr-CA" + ], + "title": "Language" + }, + "classes": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + { + "type": "null" + } + ], + "default": 
null, + "description": "Classes for describing the cue significance", + "examples": [ + "first", + "loud", + "yellow" + ], + "title": "Classes" + } + }, + "required": [ + "start_time", + "end_time" + ], + "title": "ProvenanceTrack", + "type": "object" + }, "RefItem": { "description": "RefItem.", "properties": { @@ -2327,7 +2485,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -2622,7 +2787,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -2827,7 +2999,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -2939,7 +3118,14 @@ "prov": { "default": [], "items": { - "$ref": "#/$defs/ProvenanceItem" + "oneOf": [ + { + "$ref": "#/$defs/ProvenanceItem" + }, + { + "$ref": "#/$defs/ProvenanceTrack" + } + ] }, "title": "Prov", "type": "array" @@ -2997,6 +3183,21 @@ ], "title": "TitleItem", "type": "object" + }, + "_WebVTTTimestamp": { + "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.", + "properties": { + "raw": { + "description": "A representation of the WebVTT Timestamp as a single string", + "title": "Raw", + "type": "string" + } + }, + "required": [ + "raw" + ], + "title": "_WebVTTTimestamp", + "type": "object" } }, "description": "DoclingDocument.", From b9bb0535ff4fe0fe6e9d9488fc22fc8d1a5151f9 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Thu, 4 Dec 2025 14:49:53 +0100 Subject: [PATCH 05/20] refactor(webvtt): make 
WebVTTTimestamp public Since WebVTTTimestamp is used in DoclingDocument, the class should be public. Strengthen validation of cue language start tag annotation. Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/__init__.py | 1 + docling_core/types/doc/document.py | 6 ++-- docling_core/types/doc/webvtt.py | 50 ++++++++++++++++++++++++------ docs/DoclingDocument.json | 10 +++--- test/test_webvtt.py | 31 ++++++++++++------ 5 files changed, 70 insertions(+), 28 deletions(-) diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index f0e0e92d..d8ddd0b4 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -56,6 +56,7 @@ PictureStackedBarChartData, PictureTabularChartData, ProvenanceItem, + ProvenanceTrack, RefItem, RichTableCell, Script, diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index e071b2b9..acb8b7a5 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -67,7 +67,7 @@ ) from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import parse_otsl_table_content, relative_path -from docling_core.types.doc.webvtt import _WebVTTTimestamp +from docling_core.types.doc.webvtt import WebVTTTimestamp _logger = logging.getLogger(__name__) @@ -1178,14 +1178,14 @@ class ProvenanceTrack(BaseModel): """ start_time: Annotated[ - _WebVTTTimestamp, + WebVTTTimestamp, Field( examples=["00.11.000", "00:00:06.500", "01:28:34.300"], description="Start time offset of the track cue", ), ] end_time: Annotated[ - _WebVTTTimestamp, + WebVTTTimestamp, Field( examples=["00.12.000", "00:00:08.200", "01:29:30.100"], description="End time offset of the track cue", diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index bddd6140..f6a6ea73 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -26,9 +26,18 @@ class 
_WebVTTLineTerminator(str, Enum): _WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] -class _WebVTTTimestamp(BaseModel): +class WebVTTTimestamp(BaseModel): """WebVTT timestamp. + The timestamp is a string consisting of the following components in the given order: + + - hours (optional, required if non-zero): two or more digits + - minutes: two digits between 0 and 59 + - a colon character (:) + - seconds: two digits between 0 and 59 + - a full stop character (.) + - thousandths of a second: three digits + A WebVTT timestamp is always interpreted relative to the current playback position of the media data that the WebVTT file is to be synchronized with. """ @@ -48,6 +57,7 @@ class _WebVTTTimestamp(BaseModel): @model_validator(mode="after") def validate_raw(self) -> Self: + """Validate the WebVTT timestamp as a string.""" m = self._pattern.match(self.raw) if not m: raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}") @@ -70,14 +80,15 @@ def seconds(self) -> float: @override def __str__(self) -> str: + """Return a string representation of a WebVTT timestamp.""" return self.raw class _WebVTTCueTimings(BaseModel): """WebVTT cue timings.""" - start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")] - end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")] + start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")] + end: Annotated[WebVTTTimestamp, Field(description="End time offset of the cue")] @model_validator(mode="after") def check_order(self) -> Self: @@ -197,6 +208,21 @@ def __str__(self): return f"<{self._get_name_with_classes()} {self.annotation}>" +class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated): + _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE) + + name: Literal["lang"] = Field("lang", description="The tag name") + + @field_validator("annotation", 
mode="after") + @classmethod + @override + def is_valid_annotation(cls, value: str) -> str: + if cls._pattern.match(value): + return value + else: + raise ValueError("Annotation should be in BCP 47 language tag format") + + class _WebVTTCueComponentBase(BaseModel): """WebVTT caption or subtitle cue component. @@ -267,7 +293,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase): """ kind: Literal["lang"] = "lang" - start_tag: _WebVTTCueSpanStartTagAnnotated + start_tag: _WebVTTCueLanguageSpanStartTag _WebVTTCueComponent = Annotated[ @@ -339,7 +365,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": start, end = [t.strip() for t in timing_line.split("-->")] end = re.split(" |\t", end)[0] # ignore the cue settings list - timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)) + timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end)) cue_text = " ".join(cue_lines).strip() # adding close tag for cue spans without end tag for omm in {"v"}: @@ -376,11 +402,15 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": classes: list[str] = [] if class_string: classes = [c for c in class_string.split(".") if c] - st = ( - _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip()) - if annotation - else _WebVTTCueSpanStartTag(name=ct, classes=classes) - ) + st: _WebVTTCueSpanStartTag + if annotation and ct == "lang": + st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip()) + elif annotation: + st = _WebVTTCueSpanStartTagAnnotated( + name=ct, classes=classes, annotation=annotation.strip() + ) + else: + st = _WebVTTCueSpanStartTag(name=ct, classes=classes) it = _WebVTTCueInternalText(components=children) cp: _WebVTTCueComponent if ct == "c": diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index eca74ef4..adc3aac5 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ 
-2228,7 +2228,7 @@ "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).", "properties": { "start_time": { - "$ref": "#/$defs/_WebVTTTimestamp", + "$ref": "#/$defs/WebVTTTimestamp", "description": "Start time offset of the track cue", "examples": [ "00.11.000", @@ -2237,7 +2237,7 @@ ] }, "end_time": { - "$ref": "#/$defs/_WebVTTTimestamp", + "$ref": "#/$defs/WebVTTTimestamp", "description": "End time offset of the track cue", "examples": [ "00.12.000", @@ -3184,8 +3184,8 @@ "title": "TitleItem", "type": "object" }, - "_WebVTTTimestamp": { - "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.", + "WebVTTTimestamp": { + "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.", "properties": { "raw": { "description": "A representation of the WebVTT Timestamp as a single string", @@ -3196,7 +3196,7 @@ "required": [ "raw" ], - "title": "_WebVTTTimestamp", + "title": "WebVTTTimestamp", "type": "object" } }, diff --git a/test/test_webvtt.py b/test/test_webvtt.py index b4d408cb..f4013831 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -9,17 +9,18 @@ from pydantic import ValidationError from docling_core.types.doc.webvtt import ( + WebVTTTimestamp, _WebVTTCueBlock, _WebVTTCueComponentWithTerminator, 
_WebVTTCueInternalText, _WebVTTCueItalicSpan, _WebVTTCueLanguageSpan, + _WebVTTCueLanguageSpanStartTag, _WebVTTCueSpanStartTagAnnotated, _WebVTTCueTextSpan, _WebVTTCueTimings, _WebVTTCueVoiceSpan, _WebVTTFile, - _WebVTTTimestamp, ) from .test_data_gen_flag import GEN_TEST_DATA @@ -42,7 +43,7 @@ def test_vtt_cue_commponents() -> None: 0.0, ] for idx, ts in enumerate(valid_timestamps): - model = _WebVTTTimestamp(raw=ts) + model = WebVTTTimestamp(raw=ts) assert model.seconds == valid_total_seconds[idx] """Test invalid WebVTT timestamps.""" @@ -57,35 +58,35 @@ def test_vtt_cue_commponents() -> None: ] for ts in invalid_timestamps: with pytest.raises(ValidationError): - _WebVTTTimestamp(raw=ts) + WebVTTTimestamp(raw=ts) """Test the timestamp __str__ method.""" - model = _WebVTTTimestamp(raw="00:01:02.345") + model = WebVTTTimestamp(raw="00:01:02.345") assert str(model) == "00:01:02.345" """Test valid cue timings.""" - start = _WebVTTTimestamp(raw="00:10.005") - end = _WebVTTTimestamp(raw="00:14.007") + start = WebVTTTimestamp(raw="00:10.005") + end = WebVTTTimestamp(raw="00:14.007") cue_timings = _WebVTTCueTimings(start=start, end=end) assert cue_timings.start == start assert cue_timings.end == end assert str(cue_timings) == "00:10.005 --> 00:14.007" """Test invalid cue timings with end timestamp before start.""" - start = _WebVTTTimestamp(raw="00:10.700") - end = _WebVTTTimestamp(raw="00:10.500") + start = WebVTTTimestamp(raw="00:10.700") + end = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: _WebVTTCueTimings(start=start, end=end) assert "End timestamp must be greater than start timestamp" in str(excinfo.value) """Test invalid cue timings with missing end.""" - start = _WebVTTTimestamp(raw="00:10.500") + start = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: _WebVTTCueTimings(start=start) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) """Test invalid cue timings with missing 
start.""" - end = _WebVTTTimestamp(raw="00:10.500") + end = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: _WebVTTCueTimings(end=end) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) @@ -272,3 +273,13 @@ def test_webvtt_file() -> None: assert len(block.payload) == 1 assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) assert block.payload[0].component.text == "Good." + + +def test_webvtt_cue_language_span_start_tag(): + _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}') + _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}') + _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}') + with pytest.raises(ValidationError, match="BCP 47"): + _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}') + with pytest.raises(ValidationError, match="BCP 47"): + _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}') From b26c08662c5488b33e6ebb3043c05ae754ad4a43 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 12 Dec 2025 10:49:23 +0100 Subject: [PATCH 06/20] refactor(webvtt): set languages to a list of strings in ProvenanceTrack Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/document.py | 6 +++--- docs/DoclingDocument.json | 21 ++++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index acb8b7a5..6f02c54f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1201,10 +1201,10 @@ class ProvenanceTrack(BaseModel): examples=["Mary", "Fred", "Name Surname"], description="The cue voice (speaker)", ) - language: Optional[str] = Field( + languages: Optional[list[str]] = Field( None, - examples=["en", "en-GB", "fr-CA"], - description="Language of the cue in BCP 47 language tag format", + examples=[["en", "en-GB"], ["fr-CA"]], + 
description="Languages of the cue in BCP 47 language tag format", ) classes: Optional[list[str]] = Field( None, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index adc3aac5..35175601 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -2281,23 +2281,30 @@ ], "title": "Voice" }, - "language": { + "languages": { "anyOf": [ { - "type": "string" + "items": { + "type": "string" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "Language of the cue in BCP 47 language tag format", + "description": "Languages of the cue in BCP 47 language tag format", "examples": [ - "en", - "en-GB", - "fr-CA" + [ + "en", + "en-GB" + ], + [ + "fr-CA" + ] ], - "title": "Language" + "title": "Languages" }, "classes": { "anyOf": [ From d0c97fcfa128d2b9277692cf45ede7f4cb4781af Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 12 Dec 2025 11:04:59 +0100 Subject: [PATCH 07/20] tests(webvtt): add test for ProvenanceTrack Signed-off-by: Cesar Berrospi Ramis --- test/test_doc_base.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 709e2eac..18d2cf11 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -1,6 +1,8 @@ import pytest from pydantic import ValidationError +from docling_core.types.doc.document import ProvenanceTrack +from docling_core.types.doc.webvtt import WebVTTTimestamp from docling_core.types.legacy_doc.base import Prov, S3Reference @@ -37,3 +39,34 @@ def test_prov(): with pytest.raises(ValidationError, match="at least 2 items"): prov["span"] = [0] Prov(**prov) + + +def test_prov_track(): + """Test the class ProvenanceTrack.""" + + valid_track = ProvenanceTrack( + start_time=WebVTTTimestamp(raw="00:11.000"), + end_time=WebVTTTimestamp(raw="00:12.000"), + identifier="test", + voice="Mary", + languages=["en", "en-GB"], + classes=["v.first.loud", "i.foreignphrase"], + ) + + assert valid_track + assert 
valid_track.start_time == WebVTTTimestamp(raw="00:11.000") + assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000") + assert valid_track.identifier == "test" + assert valid_track.voice == "Mary" + assert valid_track.languages == ["en", "en-GB"] + assert valid_track.classes == ["v.first.loud", "i.foreignphrase"] + + with pytest.raises(ValidationError, match="end_time"): + ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000")) + + with pytest.raises(ValidationError, match="should be a valid list"): + ProvenanceTrack( + start_time=WebVTTTimestamp(raw="00:11.000"), + end_time=WebVTTTimestamp(raw="00:12.000"), + languages="en", + ) From 86d7fe49a8ee400c21be6527d9be02a04d0f6484 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 12 Dec 2025 11:29:51 +0100 Subject: [PATCH 08/20] refactor(webvtt): make all WebVTT classes public for reuse Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 156 +++++++++++++++++++------------ test/test_webvtt.py | 130 +++++++++++++------------- 2 files changed, 161 insertions(+), 125 deletions(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index f6a6ea73..550498a9 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -84,7 +84,7 @@ def __str__(self) -> str: return self.raw -class _WebVTTCueTimings(BaseModel): +class WebVTTCueTimings(BaseModel): """WebVTT cue timings.""" start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")] @@ -92,6 +92,7 @@ class _WebVTTCueTimings(BaseModel): @model_validator(mode="after") def check_order(self) -> Self: + """Ensure start timestamp is less than or equal to end timestamp.""" if self.start and self.end: if self.end.seconds <= self.start.seconds: raise ValueError("End timestamp must be greater than start timestamp") @@ -99,10 +100,11 @@ def check_order(self) -> Self: @override def __str__(self): + """Return a string representation of the cue timings.""" return 
f"{self.start} --> {self.end}" -class _WebVTTCueTextSpan(BaseModel): +class WebVTTCueTextSpan(BaseModel): """WebVTT cue text span.""" kind: Literal["text"] = "text" @@ -111,6 +113,7 @@ class _WebVTTCueTextSpan(BaseModel): @field_validator("text", mode="after") @classmethod def is_valid_text(cls, value: str) -> str: + """Ensure cue text contains only permitted characters and HTML entities.""" for match in _ENTITY_PATTERN.finditer(value): entity = match.group(1) if entity not in _VALID_ENTITIES: @@ -126,36 +129,39 @@ def is_valid_text(cls, value: str) -> str: @override def __str__(self): + """Return a string representation of the cue text span.""" return self.text -class _WebVTTCueComponentWithTerminator(BaseModel): +class WebVTTCueComponentWithTerminator(BaseModel): """WebVTT caption or subtitle cue component optionally with a line terminator.""" - component: "_WebVTTCueComponent" + component: "WebVTTCueComponent" terminator: Optional[_WebVTTLineTerminator] = None @override def __str__(self): + """Return a string representation of the cue component with terminator.""" return f"{self.component}{self.terminator.value if self.terminator else ''}" -class _WebVTTCueInternalText(BaseModel): +class WebVTTCueInternalText(BaseModel): """WebVTT cue internal text.""" terminator: Optional[_WebVTTLineTerminator] = None components: Annotated[ - list[_WebVTTCueComponentWithTerminator], + list[WebVTTCueComponentWithTerminator], Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")), ] = [] @override def __str__(self): + """Return a string representation of the cue internal text.""" cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}" return cue_str -class _WebVTTCueSpanStartTag(BaseModel): +class WebVTTCueSpanStartTag(BaseModel): """WebVTT cue span start tag.""" name: Annotated[_START_TAG_NAMES, Field(description="The tag name")] @@ -167,6 +173,7 @@ class 
_WebVTTCueSpanStartTag(BaseModel): @field_validator("classes", mode="after") @classmethod def validate_classes(cls, value: list[str]) -> list[str]: + """Validate cue span start tag classes.""" for item in value: if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): raise ValueError("A cue span start tag class contains invalid characters") @@ -175,14 +182,16 @@ def validate_classes(cls, value: list[str]) -> list[str]: return value def _get_name_with_classes(self) -> str: + """Return the name of the cue span start tag with classes.""" return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name @override def __str__(self): + """Return a string representation of the cue span start tag.""" return f"<{self._get_name_with_classes()}>" -class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag): +class WebVTTCueSpanStartTagAnnotated(WebVTTCueSpanStartTag): """WebVTT cue span start tag requiring an annotation.""" annotation: Annotated[str, Field(description="Cue span start tag annotation")] @@ -190,6 +199,7 @@ class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag): @field_validator("annotation", mode="after") @classmethod def is_valid_annotation(cls, value: str) -> str: + """Ensure annotation contains only permitted characters and HTML entities.""" for match in _ENTITY_PATTERN.finditer(value): entity = match.group(1) if entity not in _VALID_ENTITIES: @@ -205,10 +215,13 @@ def is_valid_annotation(cls, value: str) -> str: @override def __str__(self): + """Return a string representation of the cue span start tag.""" return f"<{self._get_name_with_classes()} {self.annotation}>" -class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated): +class WebVTTCueLanguageSpanStartTag(WebVTTCueSpanStartTagAnnotated): + """WebVTT cue language span start tag.""" + _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE) name: Literal["lang"] = Field("lang", description="The tag name") @@ 
-217,13 +230,14 @@ class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated): @classmethod @override def is_valid_annotation(cls, value: str) -> str: + """Ensure that the language annotation is in BCP 47 language tag format.""" if cls._pattern.match(value): return value else: raise ValueError("Annotation should be in BCP 47 language tag format") -class _WebVTTCueComponentBase(BaseModel): +class WebVTTCueComponentBase(BaseModel): """WebVTT caption or subtitle cue component. All the WebVTT caption or subtitle cue components are represented by this class @@ -231,28 +245,30 @@ class _WebVTTCueComponentBase(BaseModel): """ kind: Literal["c", "b", "i", "u", "v", "lang"] - start_tag: _WebVTTCueSpanStartTag - internal_text: _WebVTTCueInternalText + start_tag: WebVTTCueSpanStartTag + internal_text: WebVTTCueInternalText @model_validator(mode="after") def check_tag_names_match(self) -> Self: + """Ensure that the start tag name matches this cue component type.""" if self.kind != self.start_tag.name: raise ValueError("The tag name of this cue component should be {self.kind}") return self @override def __str__(self): + """Return a string representation of the cue component.""" return f"{self.start_tag}{self.internal_text}" -class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase): +class WebVTTCueVoiceSpan(WebVTTCueComponentBase): """WebVTT cue voice span associated with a specific voice.""" kind: Literal["v"] = "v" - start_tag: _WebVTTCueSpanStartTagAnnotated + start_tag: WebVTTCueSpanStartTagAnnotated -class _WebVTTCueClassSpan(_WebVTTCueComponentBase): +class WebVTTCueClassSpan(WebVTTCueComponentBase): """WebVTT cue class span. 
It represents a span of text and it is used to annotate parts of the cue with @@ -260,31 +276,31 @@ class _WebVTTCueClassSpan(_WebVTTCueComponentBase): """ kind: Literal["c"] = "c" - start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c") + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="c") -class _WebVTTCueItalicSpan(_WebVTTCueComponentBase): +class WebVTTCueItalicSpan(WebVTTCueComponentBase): """WebVTT cue italic span representing a span of italic text.""" kind: Literal["i"] = "i" - start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i") + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="i") -class _WebVTTCueBoldSpan(_WebVTTCueComponentBase): +class WebVTTCueBoldSpan(WebVTTCueComponentBase): """WebVTT cue bold span representing a span of bold text.""" kind: Literal["b"] = "b" - start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b") + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="b") -class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase): +class WebVTTCueUnderlineSpan(WebVTTCueComponentBase): """WebVTT cue underline span representing a span of underline text.""" kind: Literal["u"] = "u" - start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u") + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="u") -class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase): +class WebVTTCueLanguageSpan(WebVTTCueComponentBase): """WebVTT cue language span. 
It represents a span of text and it is used to annotate parts of the cue where the @@ -293,18 +309,18 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase): """ kind: Literal["lang"] = "lang" - start_tag: _WebVTTCueLanguageSpanStartTag + start_tag: WebVTTCueLanguageSpanStartTag -_WebVTTCueComponent = Annotated[ +WebVTTCueComponent = Annotated[ Union[ - _WebVTTCueTextSpan, - _WebVTTCueClassSpan, - _WebVTTCueItalicSpan, - _WebVTTCueBoldSpan, - _WebVTTCueUnderlineSpan, - _WebVTTCueVoiceSpan, - _WebVTTCueLanguageSpan, + WebVTTCueTextSpan, + WebVTTCueClassSpan, + WebVTTCueItalicSpan, + WebVTTCueBoldSpan, + WebVTTCueUnderlineSpan, + WebVTTCueVoiceSpan, + WebVTTCueLanguageSpan, ], Field( discriminator="kind", @@ -313,7 +329,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase): ] -class _WebVTTCueBlock(BaseModel): +class WebVTTCueBlock(BaseModel): """Model representing a WebVTT cue block. The optional WebVTT cue settings list is not supported. @@ -324,9 +340,9 @@ class _WebVTTCueBlock(BaseModel): model_config = ConfigDict(regex_engine="python-re") identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier") - timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] + timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")] payload: Annotated[ - list[_WebVTTCueComponentWithTerminator], + list[WebVTTCueComponentWithTerminator], Field(description="The WebVTT caption or subtitle cue text"), ] @@ -341,13 +357,22 @@ class _WebVTTCueBlock(BaseModel): @field_validator("payload", mode="after") @classmethod def validate_payload(cls, payload): + """Ensure that the cue payload contains valid text.""" for voice in payload: if "-->" in str(voice): raise ValueError("Cue payload must not contain '-->'") return payload @classmethod - def parse(cls, raw: str) -> "_WebVTTCueBlock": + def parse(cls, raw: str) -> "WebVTTCueBlock": + """Parse a WebVTT cue block from a string. 
+ + Args: + raw: The raw WebVTT cue block string. + + Returns: + The parsed WebVTT cue block. + """ lines = raw.strip().splitlines() if not lines: raise ValueError("Cue block must have at least one line") @@ -365,7 +390,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": start, end = [t.strip() for t in timing_line.split("-->")] end = re.split(" |\t", end)[0] # ignore the cue settings list - timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end)) + timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end)) cue_text = " ".join(cue_lines).strip() # adding close tag for cue spans without end tag for omm in {"v"}: @@ -373,7 +398,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": cue_text += f"" break - stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]] + stack: list[list[WebVTTCueComponentWithTerminator]] = [[]] tag_stack: list[dict] = [] pos = 0 @@ -383,7 +408,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": match = matches[i] if match.start() > pos: stack[-1].append( - _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) + WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()])) ) gps = {k: (v if v else None) for k, v in match.groupdict().items()} @@ -402,36 +427,34 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": classes: list[str] = [] if class_string: classes = [c for c in class_string.split(".") if c] - st: _WebVTTCueSpanStartTag + st: WebVTTCueSpanStartTag if annotation and ct == "lang": - st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip()) + st = WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip()) elif annotation: - st = _WebVTTCueSpanStartTagAnnotated( - name=ct, classes=classes, annotation=annotation.strip() - ) + st = WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, 
annotation=annotation.strip()) else: - st = _WebVTTCueSpanStartTag(name=ct, classes=classes) - it = _WebVTTCueInternalText(components=children) - cp: _WebVTTCueComponent + st = WebVTTCueSpanStartTag(name=ct, classes=classes) + it = WebVTTCueInternalText(components=children) + cp: WebVTTCueComponent if ct == "c": - cp = _WebVTTCueClassSpan(start_tag=st, internal_text=it) + cp = WebVTTCueClassSpan(start_tag=st, internal_text=it) elif ct == "b": - cp = _WebVTTCueBoldSpan(start_tag=st, internal_text=it) + cp = WebVTTCueBoldSpan(start_tag=st, internal_text=it) elif ct == "i": - cp = _WebVTTCueItalicSpan(start_tag=st, internal_text=it) + cp = WebVTTCueItalicSpan(start_tag=st, internal_text=it) elif ct == "u": - cp = _WebVTTCueUnderlineSpan(start_tag=st, internal_text=it) + cp = WebVTTCueUnderlineSpan(start_tag=st, internal_text=it) elif ct == "lang": - cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it) + cp = WebVTTCueLanguageSpan(start_tag=st, internal_text=it) elif ct == "v": - cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it) - stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp)) + cp = WebVTTCueVoiceSpan(start_tag=st, internal_text=it) + stack[-1].append(WebVTTCueComponentWithTerminator(component=cp)) pos = match.end() i += 1 if pos < len(cue_text): - stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:]))) + stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:]))) return cls( identifier=identifier, @@ -440,6 +463,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock": ) def __str__(self): + """Return a string representation of the WebVTT cue block.""" parts = [] if self.identifier: parts.append(f"{self.identifier}\n") @@ -455,13 +479,14 @@ def __str__(self): return "".join(parts) + "\n" -class _WebVTTFile(BaseModel): +class WebVTTFile(BaseModel): """A model representing a WebVTT file.""" - cue_blocks: list[_WebVTTCueBlock] + cue_blocks: 
list[WebVTTCueBlock] @staticmethod def verify_signature(content: str) -> bool: + """Verify the WebVTT file signature.""" if not content: return False elif len(content) == 6: @@ -472,7 +497,15 @@ def verify_signature(content: str) -> bool: return False @classmethod - def parse(cls, raw: str) -> "_WebVTTFile": + def parse(cls, raw: str) -> "WebVTTFile": + """Parse a WebVTT file. + + Args: + raw: The raw WebVTT file content. + + Returns: + The parsed WebVTT file. + """ # Normalize newlines to LF raw = raw.replace("\r\n", "\n").replace("\r", "\n") @@ -490,20 +523,23 @@ def parse(cls, raw: str) -> "_WebVTTFile": # Split into cue blocks raw_blocks = re.split(r"\n\s*\n", body.strip()) - cues: list[_WebVTTCueBlock] = [] + cues: list[WebVTTCueBlock] = [] for block in raw_blocks: try: - cues.append(_WebVTTCueBlock.parse(block)) + cues.append(WebVTTCueBlock.parse(block)) except ValueError as e: _log.warning(f"Failed to parse cue block:\n{block}\n{e}") return cls(cue_blocks=cues) def __iter__(self): + """Return an iterator over the cue blocks.""" return iter(self.cue_blocks) def __getitem__(self, idx): + """Return the cue block at the given index.""" return self.cue_blocks[idx] def __len__(self): + """Return the number of cue blocks.""" return len(self.cue_blocks) diff --git a/test/test_webvtt.py b/test/test_webvtt.py index f4013831..9e47f1a8 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -9,18 +9,18 @@ from pydantic import ValidationError from docling_core.types.doc.webvtt import ( + WebVTTCueBlock, + WebVTTCueComponentWithTerminator, + WebVTTCueInternalText, + WebVTTCueItalicSpan, + WebVTTCueLanguageSpan, + WebVTTCueLanguageSpanStartTag, + WebVTTCueSpanStartTagAnnotated, + WebVTTCueTextSpan, + WebVTTCueTimings, + WebVTTCueVoiceSpan, + WebVTTFile, WebVTTTimestamp, - _WebVTTCueBlock, - _WebVTTCueComponentWithTerminator, - _WebVTTCueInternalText, - _WebVTTCueItalicSpan, - _WebVTTCueLanguageSpan, - _WebVTTCueLanguageSpanStartTag, - _WebVTTCueSpanStartTagAnnotated, 
- _WebVTTCueTextSpan, - _WebVTTCueTimings, - _WebVTTCueVoiceSpan, - _WebVTTFile, ) from .test_data_gen_flag import GEN_TEST_DATA @@ -67,7 +67,7 @@ def test_vtt_cue_commponents() -> None: """Test valid cue timings.""" start = WebVTTTimestamp(raw="00:10.005") end = WebVTTTimestamp(raw="00:14.007") - cue_timings = _WebVTTCueTimings(start=start, end=end) + cue_timings = WebVTTCueTimings(start=start, end=end) assert cue_timings.start == start assert cue_timings.end == end assert str(cue_timings) == "00:10.005 --> 00:14.007" @@ -76,92 +76,92 @@ def test_vtt_cue_commponents() -> None: start = WebVTTTimestamp(raw="00:10.700") end = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(start=start, end=end) + WebVTTCueTimings(start=start, end=end) assert "End timestamp must be greater than start timestamp" in str(excinfo.value) """Test invalid cue timings with missing end.""" start = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(start=start) # type: ignore[call-arg] + WebVTTCueTimings(start=start) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) """Test invalid cue timings with missing start.""" end = WebVTTTimestamp(raw="00:10.500") with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(end=end) # type: ignore[call-arg] + WebVTTCueTimings(end=end) # type: ignore[call-arg] assert "Field required" in str(excinfo.value) """Test with valid text.""" valid_text = "This is a valid cue text span." - span = _WebVTTCueTextSpan(text=valid_text) + span = WebVTTCueTextSpan(text=valid_text) assert span.text == valid_text assert str(span) == valid_text """Test with text containing newline characters.""" invalid_text = "This cue text span\ncontains a newline." 
with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) + WebVTTCueTextSpan(text=invalid_text) """Test with text containing ampersand.""" invalid_text = "This cue text span contains &." with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) + WebVTTCueTextSpan(text=invalid_text) invalid_text = "An invalid &foo; entity" with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) + WebVTTCueTextSpan(text=invalid_text) valid_text = "My favorite book is Pride & Prejudice" - span = _WebVTTCueTextSpan(text=valid_text) + span = WebVTTCueTextSpan(text=valid_text) assert span.text == valid_text """Test with text containing less-than sign.""" invalid_text = "This cue text span contains <." with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) + WebVTTCueTextSpan(text=invalid_text) """Test with empty text.""" with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text="") + WebVTTCueTextSpan(text="") """Test that annotation validation works correctly.""" valid_annotation = "valid-annotation" invalid_annotation = "invalid\nannotation" with pytest.raises(ValidationError): - _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation) - assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation) + WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation) + assert WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation) """Test that classes validation works correctly.""" annotation = "speaker name" valid_classes = ["class1", "class2"] invalid_classes = ["class\nwith\nnewlines", ""] with pytest.raises(ValidationError): - _WebVTTCueSpanStartTagAnnotated( + WebVTTCueSpanStartTagAnnotated( name="v", annotation=annotation, classes=invalid_classes ) - assert _WebVTTCueSpanStartTagAnnotated( + assert WebVTTCueSpanStartTagAnnotated( name="v", annotation=annotation, classes=valid_classes ) """Test that components validation works correctly.""" 
annotation = "speaker name" valid_components = [ - _WebVTTCueComponentWithTerminator( - component=_WebVTTCueTextSpan(text="random text") + WebVTTCueComponentWithTerminator( + component=WebVTTCueTextSpan(text="random text") ) ] invalid_components = [123, "not a component"] with pytest.raises(ValidationError): - _WebVTTCueInternalText(components=invalid_components) - assert _WebVTTCueInternalText(components=valid_components) + WebVTTCueInternalText(components=invalid_components) + assert WebVTTCueInternalText(components=valid_components) """Test valid cue voice spans.""" - cue_span = _WebVTTCueVoiceSpan( - start_tag=_WebVTTCueSpanStartTagAnnotated( + cue_span = WebVTTCueVoiceSpan( + start_tag=WebVTTCueSpanStartTagAnnotated( name="v", annotation="speaker", classes=["loud", "clear"] ), - internal_text=_WebVTTCueInternalText( + internal_text=WebVTTCueInternalText( components=[ - _WebVTTCueComponentWithTerminator( - component=_WebVTTCueTextSpan(text="random text") + WebVTTCueComponentWithTerminator( + component=WebVTTCueTextSpan(text="random text") ) ] ), @@ -169,12 +169,12 @@ def test_vtt_cue_commponents() -> None: expected_str = "random text" assert str(cue_span) == expected_str - cue_span = _WebVTTCueVoiceSpan( - start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"), - internal_text=_WebVTTCueInternalText( + cue_span = WebVTTCueVoiceSpan( + start_tag=WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"), + internal_text=WebVTTCueInternalText( components=[ - _WebVTTCueComponentWithTerminator( - component=_WebVTTCueTextSpan(text="random text") + WebVTTCueComponentWithTerminator( + component=WebVTTCueTextSpan(text="random text") ) ] ), @@ -188,11 +188,11 @@ def test_webvttcueblock_parse() -> None: raw: str = ( "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n" ) - block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw) + block: WebVTTCueBlock = WebVTTCueBlock.parse(raw) assert str(block.timings) == "04:02.500 --> 
04:05.000" assert len(block.payload) == 1 - assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator) - assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[0].component, WebVTTCueTextSpan) assert ( block.payload[0].component.text == "J’ai commencé le basket à l'âge de 13, 14 ans" @@ -203,23 +203,23 @@ def test_webvttcueblock_parse() -> None: "04:05.001 --> 04:07.800\n" "Sur les playground, ici à Montpellier\n" ) - block = _WebVTTCueBlock.parse(raw) + block = WebVTTCueBlock.parse(raw) assert str(block.timings) == "04:05.001 --> 04:07.800" assert len(block.payload) == 3 - assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator) - assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[0].component, WebVTTCueTextSpan) assert block.payload[0].component.text == "Sur les " - assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator) - assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan) + assert isinstance(block.payload[1], WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[1].component, WebVTTCueItalicSpan) assert len(block.payload[1].component.internal_text.components) == 1 lang_span = block.payload[1].component.internal_text.components[0].component - assert isinstance(lang_span, _WebVTTCueLanguageSpan) + assert isinstance(lang_span, WebVTTCueLanguageSpan) assert isinstance( - lang_span.internal_text.components[0].component, _WebVTTCueTextSpan + lang_span.internal_text.components[0].component, WebVTTCueTextSpan ) assert lang_span.internal_text.components[0].component.text == "playground" - assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator) - assert isinstance(block.payload[2].component, _WebVTTCueTextSpan) + assert isinstance(block.payload[2], 
WebVTTCueComponentWithTerminator) + assert isinstance(block.payload[2].component, WebVTTCueTextSpan) assert block.payload[2].component.text == ", ici à Montpellier" assert raw == str(block) @@ -228,26 +228,26 @@ def test_webvtt_file() -> None: """Test WebVTT files.""" with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f: content = f.read() - vtt = _WebVTTFile.parse(content) + vtt = WebVTTFile.parse(content) assert len(vtt) == 13 block = vtt.cue_blocks[11] assert str(block.timings) == "00:32.500 --> 00:33.500" assert len(block.payload) == 1 cue_span = block.payload[0] - assert isinstance(cue_span.component, _WebVTTCueVoiceSpan) + assert isinstance(cue_span.component, WebVTTCueVoiceSpan) assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson" assert not cue_span.component.start_tag.classes assert len(cue_span.component.internal_text.components) == 1 comp = cue_span.component.internal_text.components[0] - assert isinstance(comp.component, _WebVTTCueItalicSpan) + assert isinstance(comp.component, WebVTTCueItalicSpan) assert len(comp.component.internal_text.components) == 1 comp2 = comp.component.internal_text.components[0] - assert isinstance(comp2.component, _WebVTTCueTextSpan) + assert isinstance(comp2.component, WebVTTCueTextSpan) assert comp2.component.text == "Laughs" with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f: content = f.read() - vtt = _WebVTTFile.parse(content) + vtt = WebVTTFile.parse(content) assert len(vtt) == 4 reverse = ( "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. 
" @@ -258,7 +258,7 @@ def test_webvtt_file() -> None: with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f: content = f.read() - vtt = _WebVTTFile.parse(content) + vtt = WebVTTFile.parse(content) assert len(vtt) == 13 for block in vtt: assert block.identifier @@ -266,20 +266,20 @@ def test_webvtt_file() -> None: assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" assert str(block.timings) == "00:00:04.963 --> 00:00:08.571" assert len(block.payload) == 1 - assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan) + assert isinstance(block.payload[0].component, WebVTTCueVoiceSpan) block = vtt.cue_blocks[2] assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" assert str(block.timings) == "00:00:10.683 --> 00:00:11.563" assert len(block.payload) == 1 - assert isinstance(block.payload[0].component, _WebVTTCueTextSpan) + assert isinstance(block.payload[0].component, WebVTTCueTextSpan) assert block.payload[0].component.text == "Good." 
def test_webvtt_cue_language_span_start_tag(): - _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}') - _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}') - _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}') + WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}') + WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}') + WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}') with pytest.raises(ValidationError, match="BCP 47"): - _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}') + WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}') with pytest.raises(ValidationError, match="BCP 47"): - _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}') + WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}') From 82e80c0c2a6643dd5a17457a65f8736d86207cca Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 12 Dec 2025 17:05:16 +0100 Subject: [PATCH 09/20] chore(webvtt): preserve newlines as WebVTTLineTerminator Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 38 ++++++++++++++++++-------- test/data/webvtt/webvtt_example_04.vtt | 13 +++++++++ test/test_webvtt.py | 13 +++++++++ 3 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 test/data/webvtt/webvtt_example_04.vtt diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 550498a9..023b0192 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -2,6 +2,7 @@ import logging import re +from collections.abc import Iterator from enum import Enum from typing import Annotated, ClassVar, Literal, Optional, Union @@ -17,13 +18,15 @@ _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] -class _WebVTTLineTerminator(str, Enum): +class WebVTTLineTerminator(str, Enum): + """WebVTT 
line terminator.""" + CRLF = "\r\n" LF = "\n" CR = "\r" -_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] +WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] class WebVTTTimestamp(BaseModel): @@ -137,7 +140,7 @@ class WebVTTCueComponentWithTerminator(BaseModel): """WebVTT caption or subtitle cue component optionally with a line terminator.""" component: "WebVTTCueComponent" - terminator: Optional[_WebVTTLineTerminator] = None + terminator: Optional[WebVTTLineTerminator] = None @override def __str__(self): @@ -148,7 +151,7 @@ def __str__(self): class WebVTTCueInternalText(BaseModel): """WebVTT cue internal text.""" - terminator: Optional[_WebVTTLineTerminator] = None + terminator: Optional[WebVTTLineTerminator] = None components: Annotated[ list[WebVTTCueComponentWithTerminator], Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")), @@ -339,7 +342,7 @@ class WebVTTCueBlock(BaseModel): model_config = ConfigDict(regex_engine="python-re") - identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier") + identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier") timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")] payload: Annotated[ list[WebVTTCueComponentWithTerminator], @@ -363,6 +366,19 @@ def validate_payload(cls, payload): raise ValueError("Cue payload must not contain '-->'") return payload + @staticmethod + def _create_text_components( + text: str, + ) -> Iterator[WebVTTCueComponentWithTerminator]: + text_list = text.split("\n") + for idx, line in enumerate(text.split("\n")): + terminator = WebVTTLineTerminator.LF if idx < len(text_list) - 1 or text.endswith("\n") else None + if len(line) > 0: + yield WebVTTCueComponentWithTerminator( + component=WebVTTCueTextSpan(text=line), + terminator=terminator, + ) + 
@classmethod def parse(cls, raw: str) -> "WebVTTCueBlock": """Parse a WebVTT cue block from a string. @@ -376,7 +392,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock": lines = raw.strip().splitlines() if not lines: raise ValueError("Cue block must have at least one line") - identifier: Optional[_WebVTTCueIdentifier] = None + identifier: Optional[WebVTTCueIdentifier] = None timing_line = lines[0] if "-->" not in timing_line and len(lines) > 1: identifier = timing_line @@ -391,7 +407,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock": start, end = [t.strip() for t in timing_line.split("-->")] end = re.split(" |\t", end)[0] # ignore the cue settings list timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end)) - cue_text = " ".join(cue_lines).strip() + cue_text = "\n".join(cue_lines).strip() # adding close tag for cue spans without end tag for omm in {"v"}: if cue_text.startswith(f"<{omm}") and f"" not in cue_text: @@ -407,9 +423,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock": while i < len(matches): match = matches[i] if match.start() > pos: - stack[-1].append( - WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()])) - ) + text = cue_text[pos : match.start()] + stack[-1].extend(cls._create_text_components(text)) gps = {k: (v if v else None) for k, v in match.groupdict().items()} if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}: @@ -454,7 +469,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock": i += 1 if pos < len(cue_text): - stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:]))) + text = cue_text[pos:] + stack[-1].extend(cls._create_text_components(text)) return cls( identifier=identifier, diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt new file mode 100644 index 00000000..91be3530 --- /dev/null +++ b/test/data/webvtt/webvtt_example_04.vtt @@ -0,0 +1,13 @@ +WEBVTT + +NOTE 
Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +00:01.000 --> 00:04.000 +Never drink liquid nitrogen. + +NOTE I’m not sure the timing is right on the following cue. + +00:05.000 --> 00:09.000 +— It will perforate your stomach. +— You could die. +This is true. \ No newline at end of file diff --git a/test/test_webvtt.py b/test/test_webvtt.py index 9e47f1a8..1bf9edb8 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -274,6 +274,19 @@ def test_webvtt_file() -> None: assert isinstance(block.payload[0].component, WebVTTCueTextSpan) assert block.payload[0].component.text == "Good." + with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f: + content = f.read() + vtt = WebVTTFile.parse(content) + assert len(vtt) == 2 + block = vtt.cue_blocks[1] + assert len(block.payload) == 5 + assert str(block) == ( + "00:05.000 --> 00:09.000\n" + "— It will perforate your stomach.\n" + "— You could die.\n" + "This is true.\n" + ) + def test_webvtt_cue_language_span_start_tag(): WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}') From 5721f099345ca5e014074140888227e6573a3609 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Sun, 14 Dec 2025 23:52:36 +0100 Subject: [PATCH 10/20] refactor(webvtt): set ProvenanceTrack time fields as float Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/document.py | 20 ++++++++++----- docling_core/types/doc/webvtt.py | 2 +- docs/DoclingDocument.json | 41 ++++++++++-------------------- test/test_doc_base.py | 23 ++++++++++------- 4 files changed, 42 insertions(+), 44 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 6f02c54f..7f088389 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -67,7 +67,6 @@ ) from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import parse_otsl_table_content, relative_path -from 
docling_core.types.doc.webvtt import WebVTTTimestamp _logger = logging.getLogger(__name__) @@ -1178,17 +1177,17 @@ class ProvenanceTrack(BaseModel): """ start_time: Annotated[ - WebVTTTimestamp, + float, Field( - examples=["00.11.000", "00:00:06.500", "01:28:34.300"], - description="Start time offset of the track cue", + examples=[11.0, 6.5, 5370.0], + description="Start time offset of the track cue in seconds", ), ] end_time: Annotated[ - WebVTTTimestamp, + float, Field( - examples=["00.12.000", "00:00:08.200", "01:29:30.100"], - description="End time offset of the track cue", + examples=[12.0, 8.2, 5370.1], + description="End time offset of the track cue in seconds", ), ] identifier: Optional[str] = Field( @@ -1213,6 +1212,13 @@ class ProvenanceTrack(BaseModel): description="Classes for describing the cue significance", ) + @model_validator(mode="after") + def check_order(self) -> Self: + """Ensure start time is less than the end time.""" + if self.end_time <= self.start_time: + raise ValueError("End time must be greater than start time") + return self + def get_provenance_discriminator_value(v: Any) -> str: """Callable discriminator for provenance instances. 
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 023b0192..30fa1a4f 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -95,7 +95,7 @@ class WebVTTCueTimings(BaseModel): @model_validator(mode="after") def check_order(self) -> Self: - """Ensure start timestamp is less than or equal to end timestamp.""" + """Ensure start timestamp is less than end timestamp.""" if self.start and self.end: if self.end.seconds <= self.start.seconds: raise ValueError("End timestamp must be greater than start timestamp") diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 35175601..45a5d889 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -2228,22 +2228,24 @@ "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).", "properties": { "start_time": { - "$ref": "#/$defs/WebVTTTimestamp", - "description": "Start time offset of the track cue", + "description": "Start time offset of the track cue in seconds", "examples": [ - "00.11.000", - "00:00:06.500", - "01:28:34.300" - ] + 11.0, + 6.5, + 5370.0 + ], + "title": "Start Time", + "type": "number" }, "end_time": { - "$ref": "#/$defs/WebVTTTimestamp", - "description": "End time offset of the track cue", + "description": "End time offset of the track cue in seconds", "examples": [ - "00.12.000", - "00:00:08.200", - "01:29:30.100" - ] + 12.0, + 8.2, + 5370.1 + ], + "title": "End Time", + "type": "number" }, "identifier": { "anyOf": [ @@ -3190,21 +3192,6 @@ ], "title": "TitleItem", "type": "object" - }, - "WebVTTTimestamp": { - "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon 
character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.", - "properties": { - "raw": { - "description": "A representation of the WebVTT Timestamp as a single string", - "title": "Raw", - "type": "string" - } - }, - "required": [ - "raw" - ], - "title": "WebVTTTimestamp", - "type": "object" } }, "description": "DoclingDocument.", diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 18d2cf11..2d1ce498 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -1,8 +1,7 @@ import pytest from pydantic import ValidationError -from docling_core.types.doc.document import ProvenanceTrack -from docling_core.types.doc.webvtt import WebVTTTimestamp +from docling_core.types.doc import ProvenanceTrack from docling_core.types.legacy_doc.base import Prov, S3Reference @@ -45,8 +44,8 @@ def test_prov_track(): """Test the class ProvenanceTrack.""" valid_track = ProvenanceTrack( - start_time=WebVTTTimestamp(raw="00:11.000"), - end_time=WebVTTTimestamp(raw="00:12.000"), + start_time=11.0, + end_time=12.0, identifier="test", voice="Mary", languages=["en", "en-GB"], @@ -54,19 +53,25 @@ def test_prov_track(): ) assert valid_track - assert valid_track.start_time == WebVTTTimestamp(raw="00:11.000") - assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000") + assert valid_track.start_time == 11.0 + assert valid_track.end_time == 12.0 assert valid_track.identifier == "test" assert valid_track.voice == "Mary" assert valid_track.languages == ["en", "en-GB"] assert valid_track.classes == ["v.first.loud", "i.foreignphrase"] with pytest.raises(ValidationError, match="end_time"): - ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000")) + ProvenanceTrack(start_time=11.0) with pytest.raises(ValidationError, match="should be a valid list"): 
ProvenanceTrack( - start_time=WebVTTTimestamp(raw="00:11.000"), - end_time=WebVTTTimestamp(raw="00:12.000"), + start_time=11.0, + end_time=12.0, languages="en", ) + + with pytest.raises(ValidationError, match="must be greater than start"): + ProvenanceTrack( + start_time=11.0, + end_time=11.0, + ) From 134cf959d3b742096ca26ba679464242b0378435 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Mon, 15 Dec 2025 22:13:16 +0100 Subject: [PATCH 11/20] chore(webvtt): ensure start time offsets are in sequence Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 34 +++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 30fa1a4f..bf5b7227 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -4,6 +4,7 @@ import re from collections.abc import Iterator from enum import Enum +from functools import total_ordering from typing import Annotated, ClassVar, Literal, Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -29,6 +30,7 @@ class WebVTTLineTerminator(str, Enum): WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] +@total_ordering class WebVTTTimestamp(BaseModel): """WebVTT timestamp. 
@@ -81,6 +83,18 @@ def seconds(self) -> float: """A representation of the WebVTT Timestamp in seconds.""" return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0 + def __eq__(self, other: object) -> bool: + """Two timestamps are equal if their total number of seconds is equal.""" + if not isinstance(other, WebVTTTimestamp): + return NotImplemented + return self.seconds == other.seconds + + def __lt__(self, other: "WebVTTTimestamp") -> bool: + """Return True if this timestamp occurs before `other`.""" + if not isinstance(other, WebVTTTimestamp): + return NotImplemented + return self.seconds < other.seconds + @override def __str__(self) -> str: """Return a string representation of a WebVTT timestamp.""" @@ -97,7 +111,7 @@ class WebVTTCueTimings(BaseModel): def check_order(self) -> Self: """Ensure start timestamp is less than end timestamp.""" if self.start and self.end: - if self.end.seconds <= self.start.seconds: + if self.end <= self.start: raise ValueError("End timestamp must be greater than start timestamp") return self @@ -512,6 +526,24 @@ def verify_signature(content: str) -> bool: else: return False + @model_validator(mode="after") + def validate_start_time(self) -> Self: + """Validate cue start times. + + The start time offset of the cue must be greater than or equal to the start + time offsets of all previous cues. + """ + idx: int = 0 + while idx < (len(self.cue_blocks) - 1): + if self.cue_blocks[idx + 1].timings.start < self.cue_blocks[idx].timings.start: + raise ValueError( + f"The start time offset of block {idx + 1} must be greater than or" + " equal to the start time offsets of all previous cues in the file" + ) + idx += 1 + + return self + @classmethod def parse(cls, raw: str) -> "WebVTTFile": """Parse a WebVTT file. 
From 3983b4456b1c72f7bc59aacf67c96393363e040f Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Tue, 16 Dec 2025 17:04:20 +0100 Subject: [PATCH 12/20] chore(webvtt): improve regex to remove note,region,style blocks Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 11 ++++------- test/data/webvtt/webvtt_example_04.vtt | 20 ++++++++++++++++++++ test/test_webvtt.py | 6 +++++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index bf5b7227..c4f7336f 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -1,7 +1,7 @@ """Models for the Docling's adoption of Web Video Text Tracks format.""" -import logging import re +import warnings from collections.abc import Iterator from enum import Enum from functools import total_ordering @@ -11,9 +11,6 @@ from pydantic.types import StringConstraints from typing_extensions import Self, override -_log = logging.getLogger(__name__) - - _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);") _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] @@ -512,6 +509,7 @@ def __str__(self): class WebVTTFile(BaseModel): """A model representing a WebVTT file.""" + _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)") cue_blocks: list[WebVTTCueBlock] @staticmethod @@ -566,8 +564,7 @@ def parse(cls, raw: str) -> "WebVTTFile": body = lines[1] if len(lines) > 1 else "" # Remove NOTE/STYLE/REGION blocks - body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE) - body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE) + body = re.sub(cls._pattern, "", body) # Split into cue blocks raw_blocks = re.split(r"\n\s*\n", body.strip()) @@ -576,7 +573,7 @@ def parse(cls, raw: str) -> "WebVTTFile": try: cues.append(WebVTTCueBlock.parse(block)) except 
ValueError as e: - _log.warning(f"Failed to parse cue block:\n{block}\n{e}") + warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning) return cls(cue_blocks=cues) diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt index 91be3530..b0519be2 100644 --- a/test/data/webvtt/webvtt_example_04.vtt +++ b/test/data/webvtt/webvtt_example_04.vtt @@ -2,6 +2,26 @@ WEBVTT NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ +STYLE +::cue { + background-image: linear-gradient(to bottom, dimgray, lightgray); + color: papayawhip; +} +/* Style blocks cannot use blank lines nor "dash dash greater than" */ + +REGION +id:editor-comments +width: 40% +regionanchor:0%,100% +viewportanchor:10%,90% + +REGION +id:scroll +width: 40% +regionanchor:100%,100% +viewportanchor:90%,90% +scroll:up + 00:01.000 --> 00:04.000 Never drink liquid nitrogen. diff --git a/test/test_webvtt.py b/test/test_webvtt.py index 1bf9edb8..51f448ed 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -5,6 +5,8 @@ Copyright © 2019 World Wide Web Consortium. 
""" +import warnings + import pytest from pydantic import ValidationError @@ -276,7 +278,9 @@ def test_webvtt_file() -> None: with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f: content = f.read() - vtt = WebVTTFile.parse(content) + with warnings.catch_warnings(): + warnings.simplefilter("error") + vtt = WebVTTFile.parse(content) assert len(vtt) == 2 block = vtt.cue_blocks[1] assert len(block.payload) == 5 From ff30e427a9235ef669a43a14b83eea79f6a9506d Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Tue, 16 Dec 2025 17:23:34 +0100 Subject: [PATCH 13/20] chore(webvtt): parse the WebVTT file title Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/webvtt.py | 4 +++- test/data/webvtt/webvtt_example_04.vtt | 2 +- test/test_webvtt.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index c4f7336f..6b4eba1f 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -510,6 +510,7 @@ class WebVTTFile(BaseModel): """A model representing a WebVTT file.""" _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)") + title: Optional[str] = None cue_blocks: list[WebVTTCueBlock] @staticmethod @@ -561,6 +562,7 @@ def parse(cls, raw: str) -> "WebVTTFile": # Strip "WEBVTT" header line lines = raw.split("\n", 1) + title = lines[0].removeprefix("WEBVTT").strip() or None body = lines[1] if len(lines) > 1 else "" # Remove NOTE/STYLE/REGION blocks @@ -575,7 +577,7 @@ def parse(cls, raw: str) -> "WebVTTFile": except ValueError as e: warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning) - return cls(cue_blocks=cues) + return cls(title=title, cue_blocks=cues) def __iter__(self): """Return an iterator over the cue blocks.""" diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt index b0519be2..78b5ba0c 100644 --- 
a/test/data/webvtt/webvtt_example_04.vtt +++ b/test/data/webvtt/webvtt_example_04.vtt @@ -1,4 +1,4 @@ -WEBVTT +WEBVTT Danger of Nitrogen NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ diff --git a/test/test_webvtt.py b/test/test_webvtt.py index 51f448ed..a3443fd2 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -275,6 +275,7 @@ def test_webvtt_file() -> None: assert len(block.payload) == 1 assert isinstance(block.payload[0].component, WebVTTCueTextSpan) assert block.payload[0].component.text == "Good." + assert not vtt.title with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f: content = f.read() @@ -290,6 +291,7 @@ def test_webvtt_file() -> None: "— You could die.\n" "This is true.\n" ) + assert vtt.title == "Danger of Nitrogen" def test_webvtt_cue_language_span_start_tag(): From 6da51be58118aaf805579d07f61a80d41f381c45 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Tue, 6 Jan 2026 15:47:34 +0100 Subject: [PATCH 14/20] chore(webvtt): rebase to latest changes in idoctags Signed-off-by: Cesar Berrospi Ramis --- docling_core/experimental/idoctags.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py index dd19d7f0..7376062b 100644 --- a/docling_core/experimental/idoctags.py +++ b/docling_core/experimental/idoctags.py @@ -175,6 +175,8 @@ def _create_location_tokens_for_item( return "" out: list[str] = [] for prov in item.prov: + if not isinstance(prov, ProvenanceItem): + continue page_w, page_h = doc.pages[prov.page_no].size.as_tuple() bbox = prov.bbox.to_top_left_origin(page_h).as_tuple() out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres)) @@ -1379,12 +1381,14 @@ def serialize( # we will need to do something more complex I believe ... 
res: list[SerializationResult] = [] for idp, prov_ in enumerate(item.prov): - item_ = copy.deepcopy(item) + if not isinstance(prov_, ProvenanceItem): + continue + item_: TextItem = copy.deepcopy(item) item_.prov = [prov_] item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]] # it must be `orig`, not `text` here! item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]] - - item_.prov[0].charspan = (0, len(item_.orig)) + if isinstance(item_.prov[0], ProvenanceItem): + item_.prov[0].charspan = (0, len(item_.orig)) # marker field should be cleared on subsequent split parts if idp > 0 and isinstance(item_, ListItem): @@ -1748,7 +1752,7 @@ def _emit_otsl( if params.add_table_cell_location: # Check if we have all required information for location serialization - if item.prov and len(item.prov) > 0: + if item.prov and isinstance(item.prov[0], ProvenanceItem): page_no = item.prov[0].page_no if doc.pages and page_no in doc.pages: page_w, page_h = doc.pages[page_no].size.as_tuple() @@ -1897,6 +1901,8 @@ def serialize( for it, _ in doc.iterate_items(root=item): if isinstance(it, DocItem) and it.prov: for prov in it.prov: + if not isinstance(prov, ProvenanceItem): + continue page_w, page_h = doc.pages[prov.page_no].size.as_tuple() boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple()) prov_page_w_h = (page_w, page_h, prov.page_no) From 0a9e190ccee54d3f55c90c2889f0980f6a5ea2ab Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Mon, 19 Jan 2026 18:54:07 +0100 Subject: [PATCH 15/20] feat(webvtt): add WebVTT serializer Add a DoclingDocument serializer to WebVTT format. Improve WebVTT data model. 
Signed-off-by: Cesar Berrospi Ramis --- docling_core/transforms/serializer/common.py | 3 +- docling_core/transforms/serializer/webvtt.py | 545 +++++++++++++++++++ docling_core/types/doc/document.py | 6 +- docling_core/types/doc/webvtt.py | 150 ++++- docs/DoclingDocument.json | 6 +- test/data/doc/webvtt_example_01.gt.vtt | 40 ++ test/data/doc/webvtt_example_01.json | 313 +++++++++++ test/data/doc/webvtt_example_02.gt.vtt | 16 + test/data/doc/webvtt_example_02.json | 272 +++++++++ test/data/doc/webvtt_example_03.gt.vtt | 57 ++ test/data/doc/webvtt_example_03.json | 406 ++++++++++++++ test/data/doc/webvtt_example_04.gt.vtt | 9 + test/data/doc/webvtt_example_04.json | 194 +++++++ test/data/doc/webvtt_example_05.gt.vtt | 10 + test/data/doc/webvtt_example_05.json | 344 ++++++++++++ test/test_serialization.py | 25 + test/test_webvtt.py | 7 +- 17 files changed, 2372 insertions(+), 31 deletions(-) create mode 100644 docling_core/transforms/serializer/webvtt.py create mode 100644 test/data/doc/webvtt_example_01.gt.vtt create mode 100644 test/data/doc/webvtt_example_01.json create mode 100644 test/data/doc/webvtt_example_02.gt.vtt create mode 100644 test/data/doc/webvtt_example_02.json create mode 100644 test/data/doc/webvtt_example_03.gt.vtt create mode 100644 test/data/doc/webvtt_example_03.json create mode 100644 test/data/doc/webvtt_example_04.gt.vtt create mode 100644 test/data/doc/webvtt_example_04.json create mode 100644 test/data/doc/webvtt_example_05.gt.vtt create mode 100644 test/data/doc/webvtt_example_05.json diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index c9c497f4..c36062e0 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -324,7 +324,7 @@ def serialize_doc( parts: list[SerializationResult], **kwargs: Any, ) -> SerializationResult: - """Serialize a document out of its pages.""" + """Serialize a document out of its parts.""" ... 
def _serialize_body(self, **kwargs) -> SerializationResult: @@ -355,7 +355,6 @@ def serialize( empty_res = create_ser_result() my_item = item or self.doc.body - if my_item == self.doc.body: if my_item.meta and not self._meta_is_wrapped(): meta_part = self.serialize_meta(item=my_item, **my_kwargs) diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py new file mode 100644 index 00000000..15fdbc3b --- /dev/null +++ b/docling_core/transforms/serializer/webvtt.py @@ -0,0 +1,545 @@ +"""Define classes for WebVTT serialization.""" + +import logging +import re +from typing import Any, Optional, get_args + +from pydantic import BaseModel +from typing_extensions import override + +from docling_core.transforms.serializer.base import ( + BaseAnnotationSerializer, + BaseDocSerializer, + BaseFallbackSerializer, + BaseFormSerializer, + BaseInlineSerializer, + BaseKeyValueSerializer, + BaseListSerializer, + BaseMetaSerializer, + BasePictureSerializer, + BaseTableSerializer, + BaseTextSerializer, + SerializationResult, +) +from docling_core.transforms.serializer.common import ( + CommonParams, + DocSerializer, + create_ser_result, +) +from docling_core.types.doc.document import ( + ContentLayer, + DocItem, + DocItemLabel, + DoclingDocument, + Formatting, + FormItem, + InlineGroup, + KeyValueItem, + ListGroup, + NodeItem, + PictureItem, + ProvenanceTrack, + TableItem, + TextItem, + TitleItem, +) +from docling_core.types.doc.webvtt import ( + START_TAG_NAMES, + WebVTTCueBlock, + WebVTTCueSpanStartTag, + WebVTTCueSpanStartTagAnnotated, + WebVTTCueTimings, + WebVTTFile, + WebVTTLineTerminator, + WebVTTTimestamp, +) + +_logger = logging.getLogger(__name__) + + +def _remove_consecutive_pairs(text: str) -> str: + """Remove one pass of consecutive start/end tag pairs. + + This function looks for patterns like where the tags are identical + and removes them. It handles two cases: + 1. Direct adjacent tags with content: contentwhitespace + 2. 
Tags with other tags in between: + + Args: + text: Input string + + Returns: + String with one pass of consecutive pairs removed + """ + # Pattern 1: Direct adjacent tags with same classes and annotations + pattern1 = re.compile( + r"<([bciuv]|lang)((?:\.\w+)*)(?:\s+([^>]+))?>" # Opening tag: capture tag, classes, annotation + r"((?:(?!).)*?)" # Content (non-greedy, not containing the closing tag) + r"" # Closing tag + r"(\s*)" # Capture whitespace between tags (including newlines) + r"<\1((?:\.\w+)*)(?:\s+([^>]+))?>" # Next opening tag: capture classes and annotation + ) + + def replacer1(match: re.Match[str]) -> str: + tag = match.group(1) + classes1 = match.group(2) or "" + anno1 = match.group(3) or "" + content = match.group(4) + whitespace = match.group(5) # Whitespace between tags + classes2 = match.group(6) or "" + anno2 = match.group(7) or "" + + # Only merge if classes and annotations match + if classes1 == classes2 and anno1 == anno2: + # Merge: remove the closing and opening tags, but keep the whitespace + return f"<{tag}{classes1}{' ' + anno1 if anno1 else ''}>{content}{whitespace}" + else: + # Don't merge - return original + return match.group(0) + + # Pattern 2: Tags with other tags in between + # This removes redundant and when there's another tag in between + pattern2 = re.compile( + r"" # Closing tag + r"(<[^>]+>)" # Any other tag in between + r"<\1(?:\.\w+)*(?:\s+[^>]+)?>" # Same opening tag (with any classes/annotations) + ) + + def replacer2(match: re.Match[str]) -> str: + # Just keep the middle tag, remove the closing and opening of the same type + return match.group(2) + + result = pattern1.sub(replacer1, text) + result = pattern2.sub(replacer2, result) + + return result + + +class WebVTTParams(CommonParams): + """Serialization parameters for the Web Video Text Tracks (WebVTT) format.""" + + layers: set[ContentLayer] = {ContentLayer.BODY} + + +class WebVTTTextSerializer(BaseModel, BaseTextSerializer): + """Text serializer to Web Video Text 
Tracks (WebVTT) format.""" + + @override + def serialize( + self, + *, + item: TextItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + is_inline_scope: bool = False, + visited: Optional[set[str]] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the passed item.""" + # Handle TitleItem specially - it doesn't have provenance but we need its text + if isinstance(item, TitleItem): + return create_ser_result(text=item.text, span_source=item) + + # Only process items with ProvenanceTrack (WebVTT cues) + if not item.text or not item.prov or not isinstance(item.prov[0], ProvenanceTrack): + return create_ser_result() + + # Apply post-processing here: formatting, classes, language, and voice + # If the TextItem is part of an InlineGroup, we need to further post-process it + # within the group context + + prov: ProvenanceTrack = item.prov[0] + text: str = doc_serializer.post_process( + text=item.text, + formatting=item.formatting, + voice=prov.voice, + languages=prov.languages, + classes=prov.classes, + ) + if is_inline_scope: + # Iteratively remove unnecessary consecutive tag pairs until no more changes + prev_text: Optional[str] = None + while prev_text != text: + prev_text = text + text = _remove_consecutive_pairs(text) + + return create_ser_result(text=text, span_source=item) + + +class _WebVTTTableSerializer(BaseTableSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: TableItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTPictureSerializer(BasePictureSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: PictureItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + 
return create_ser_result() + + +class _WebVTTKeyValueSerializer(BaseKeyValueSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: KeyValueItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTFormSerializer(BaseFormSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: FormItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTFallbackSerializer(BaseFallbackSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTListSerializer(BaseModel, BaseListSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: ListGroup, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + list_level: int = 0, + is_inline_scope: bool = False, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, list_level, is_inline_scope, item, doc_serializer, kwargs) + return create_ser_result() + + +class WebVTTInlineSerializer(BaseInlineSerializer): + """Inline group serializer to Web Video Text Tracks (WebVTT) format.""" + + @override + def serialize( + self, + *, + item: InlineGroup, + doc_serializer: "BaseDocSerializer", + doc: DoclingDocument, + list_level: int = 0, + visited: Optional[set[str]] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serializes an inline group to WebVTT format.""" + _ = doc + my_visited = visited if visited is not None else 
set() + parts = doc_serializer.get_parts( + item=item, + list_level=list_level, + is_inline_scope=True, + visited=my_visited, + **kwargs, + ) + # Include all parts, even if text is empty or whitespace-only + # Use 'is not None' instead of truthiness check to preserve whitespace + text_res = "".join([p.text for p in parts if p.text is not None]) + + # Apply tag normalization to the concatenated result + # Iteratively remove consecutive pairs until no more changes + prev_text = None + while prev_text != text_res: + prev_text = text_res + text_res = _remove_consecutive_pairs(text_res) + + return create_ser_result(text=text_res, span_source=parts) + + +class _WebVTTMetaSerializer(BaseModel, BaseMetaSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, item, kwargs) + return create_ser_result() + + +class _WebVTTAnnotationSerializer(BaseModel, BaseAnnotationSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: DocItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, item, kwargs) + return create_ser_result() + + +class WebVTTDocSerializer(DocSerializer): + """Document serializer to Web Video Text Tracks (WebVTT) format.""" + + text_serializer: BaseTextSerializer = WebVTTTextSerializer() + table_serializer: BaseTableSerializer = _WebVTTTableSerializer() + picture_serializer: BasePictureSerializer = _WebVTTPictureSerializer() + key_value_serializer: BaseKeyValueSerializer = _WebVTTKeyValueSerializer() + form_serializer: BaseFormSerializer = _WebVTTFormSerializer() + fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer() + list_serializer: BaseListSerializer = _WebVTTListSerializer() + inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer() + meta_serializer: Optional[BaseMetaSerializer] = 
_WebVTTMetaSerializer() + annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer() + + params: CommonParams = CommonParams() + + @override + def requires_page_break(self) -> bool: + """Whether to add page breaks. + + WebVTT format does not support page breaks. + """ + return False + + @override + def serialize_bold(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific bold serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("b", []) + + return self.serialize_cue_span( + text, + tag="b", + css=classes, + ) + + @override + def serialize_italic(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific italic serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("i", []) + + return self.serialize_cue_span( + text, + tag="i", + css=classes, + ) + + @override + def serialize_underline(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific underline serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("u", []) + + return self.serialize_cue_span( + text, + tag="u", + css=classes, + ) + + def serialize_cue_span( + self, + text: str, + tag: START_TAG_NAMES, + anno: Optional[str] = None, + css: list[str] = [], + ) -> str: + """Apply serialization to a WebVTT cue span.""" + start_tag: WebVTTCueSpanStartTag + if tag in {"b", "i", "u", "c"}: + start_tag = WebVTTCueSpanStartTag(name=tag, classes=css) + elif tag in {"v", "lang"}: + if not anno: + _logger.warning(f"Invalid {tag} cue span without annotation: {text}") + return text + else: + start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=css, annotation=anno) + else: + return text + + res: str = f"{start_tag}{text}" + return res + + @staticmethod + def _extract_classes(classes: list[str]) -> dict[str, list[str]]: + """Extract tag and values from provenance classes. + + Args: + classes: The classes from a ProvenanceTrack object. + + Returns: + Map of tag to class values. 
+ """ + res: dict[str, list[str]] = {} + for item in classes or []: + for prefix in get_args(START_TAG_NAMES): + if item == prefix: + res[prefix] = [] + break + elif item.startswith(prefix + "."): + cls_str: str = item[len(prefix) + 1 :] + res[prefix] = cls_str.split(".") + break + return res + + @override + def serialize_doc( + self, + *, + parts: list[SerializationResult], + **kwargs: Any, + ) -> SerializationResult: + """Serialize a document out of its parts.""" + title: Optional[str] = None + + timings: Optional[WebVTTCueTimings] = None + id: Optional[str] = None + text: str = "" + cue_blocks: list[WebVTTCueBlock] = [] + for part in parts: + if not part.text or not part.spans: + continue + + # Get the doc item from the first span + doc_item: DocItem = part.spans[0].item + + # Handle title items (check both TitleItem type and label) + if isinstance(doc_item, TitleItem) or ( + isinstance(doc_item, TextItem) and doc_item.label == DocItemLabel.TITLE + ): + title = part.text + continue + if isinstance(doc_item, InlineGroup) and doc_item.children: + doc_item = doc_item.children[0].resolve(doc=self.doc) + if isinstance(doc_item, TextItem) and doc_item.prov and isinstance(doc_item.prov[0], ProvenanceTrack): + prov: ProvenanceTrack = doc_item.prov[0] + if ( + prov.identifier == id + and timings + and timings.start.seconds == prov.start_time + and timings.end.seconds == prov.end_time + ): + # When combining items with same timing, add newline and merge consecutive tags + combined = text.rstrip() + WebVTTLineTerminator.LF.value + part.text + # Use _remove_consecutive_pairs to merge tags like \n + # Iteratively remove consecutive pairs until no more changes + prev_combined = None + while prev_combined != combined: + prev_combined = combined + combined = _remove_consecutive_pairs(combined) + text = combined + WebVTTLineTerminator.LF.value + else: + if text: + cue_blocks.append(WebVTTCueBlock.parse(text)) + timings = WebVTTCueTimings( + 
start=WebVTTTimestamp.from_seconds(prov.start_time), + end=WebVTTTimestamp.from_seconds(prov.end_time), + ) + id = prov.identifier + text = ( + f"{id + WebVTTLineTerminator.LF.value if id else ''}{timings}" + f"{WebVTTLineTerminator.LF.value}{part.text}" + f"{WebVTTLineTerminator.LF.value}" + ) + if text: + cue_blocks.append(WebVTTCueBlock.parse(text)) + + webvtt_file = WebVTTFile(title=title, cue_blocks=cue_blocks) + content = str(webvtt_file) + return create_ser_result(text=content, span_source=parts) + + def post_process( + self, + text: str, + formatting: Optional[Formatting] = None, + voice: Optional[str] = None, + languages: Optional[list[str]] = None, + classes: Optional[list[str]] = None, + **kwargs: Any, + ) -> str: + """Apply some text post-processing steps by adding formatting tags. + + The order of the formatting tags is determined by this function and `DocSerializer.post_process`, + from the innermost to the outermost: + 1. language () + 2. underline () + 3. italic () + 4. bold () + 5. class () + 6. 
voice () + """ + res: str = text + cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {} + + for lang in languages or []: + res = self.serialize_cue_span(text=res, tag="lang", anno=lang, css=cls.get("lang", [])) + + res = super().post_process(text=res, formatting=formatting, classes=cls) + + if "c" in cls: + res = self.serialize_cue_span( + text=res, + tag="c", + css=cls.get("c", []), + ) + if voice: + res = self.serialize_cue_span( + text=res, + tag="v", + anno=voice, + css=cls.get("v", []), + ) + + return res diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 7f088389..dc0dbbf2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1208,7 +1208,7 @@ class ProvenanceTrack(BaseModel): classes: Optional[list[str]] = Field( None, min_length=1, - examples=["first", "loud", "yellow"], + examples=["b.first", "v.loud", "c.yellow"], description="Classes for describing the cue significance", ) @@ -1220,7 +1220,7 @@ def check_order(self) -> Self: return self -def get_provenance_discriminator_value(v: Any) -> str: +def _get_provenance_discriminator_value(v: Any) -> str: """Callable discriminator for provenance instances. 
Args: @@ -1237,7 +1237,7 @@ def get_provenance_discriminator_value(v: Any) -> str: ProvenanceType = Annotated[ Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]], - Discriminator(get_provenance_discriminator_value), + Discriminator(_get_provenance_discriminator_value), ] diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 6b4eba1f..6bc4a219 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -13,7 +13,7 @@ _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);") -_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] +START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] class WebVTTLineTerminator(str, Enum): @@ -80,6 +80,23 @@ def seconds(self) -> float: """A representation of the WebVTT Timestamp in seconds.""" return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0 + @classmethod + def from_seconds(cls, seconds: float) -> Self: + """Create a WebVTT timestamp from seconds. + + Args: + seconds: The time in seconds (can include fractional seconds for milliseconds). + + Returns: + A WebVTT timestamp instance. + """ + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + millis: int = round((seconds % 1) * 1000) + + return cls(raw=f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}") + def __eq__(self, other: object) -> bool: """Two timestamps are equal if their total number of seconds is equal.""" if not isinstance(other, WebVTTTimestamp): @@ -92,9 +109,27 @@ def __lt__(self, other: "WebVTTTimestamp") -> bool: return NotImplemented return self.seconds < other.seconds + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the timestamp as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0. + + Returns: + Formatted timestamp string. 
+ """ + if omit_hours_if_zero and self._hours == 0: + return f"{self._minutes:02d}:{self._seconds:02d}.{self._millis:03d}" + return self.raw + @override def __str__(self) -> str: - """Return a string representation of a WebVTT timestamp.""" + """Return a string representation of a WebVTT timestamp. + + Always returns the full timestamp format including hours (HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation (MM:SS.mmm) when hours are zero. + """ return self.raw @@ -112,9 +147,27 @@ def check_order(self) -> Self: raise ValueError("End timestamp must be greater than start timestamp") return self + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the cue timings as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in both timestamps. + + Returns: + Formatted cue timings string in the format "start --> end". + """ + start_str = self.start.format(omit_hours_if_zero=omit_hours_if_zero) + end_str = self.end.format(omit_hours_if_zero=omit_hours_if_zero) + return f"{start_str} --> {end_str}" + @override - def __str__(self): - """Return a string representation of the cue timings.""" + def __str__(self) -> str: + """Return a string representation of the cue timings. + + Always returns the full format including hours (HH:MM:SS.mmm --> HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. 
+ """ return f"{self.start} --> {self.end}" @@ -142,7 +195,7 @@ def is_valid_text(cls, value: str) -> str: return value @override - def __str__(self): + def __str__(self) -> str: """Return a string representation of the cue text span.""" return self.text @@ -154,7 +207,7 @@ class WebVTTCueComponentWithTerminator(BaseModel): terminator: Optional[WebVTTLineTerminator] = None @override - def __str__(self): + def __str__(self) -> str: """Return a string representation of the cue component with terminator.""" return f"{self.component}{self.terminator.value if self.terminator else ''}" @@ -169,7 +222,7 @@ class WebVTTCueInternalText(BaseModel): ] = [] @override - def __str__(self): + def __str__(self) -> str: """Return a string representation of the cue internal text.""" cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}" return cue_str @@ -178,7 +231,7 @@ def __str__(self): class WebVTTCueSpanStartTag(BaseModel): """WebVTT cue span start tag.""" - name: Annotated[_START_TAG_NAMES, Field(description="The tag name")] + name: Annotated[START_TAG_NAMES, Field(description="The tag name")] classes: Annotated[ list[str], Field(description="List of classes representing the cue span's significance"), @@ -200,7 +253,7 @@ def _get_name_with_classes(self) -> str: return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name @override - def __str__(self): + def __str__(self) -> str: """Return a string representation of the cue span start tag.""" return f"<{self._get_name_with_classes()}>" @@ -228,7 +281,7 @@ def is_valid_annotation(cls, value: str) -> str: return value @override - def __str__(self): + def __str__(self) -> str: """Return a string representation of the cue span start tag.""" return f"<{self._get_name_with_classes()} {self.annotation}>" @@ -270,7 +323,7 @@ def check_tag_names_match(self) -> Self: return self @override - def __str__(self): + def __str__(self) -> str: """Return a string 
representation of the cue component.""" return f"{self.start_tag}{self.internal_text}" @@ -391,7 +444,7 @@ def _create_text_components( ) @classmethod - def parse(cls, raw: str) -> "WebVTTCueBlock": + def parse(cls, raw: str) -> Self: """Parse a WebVTT cue block from a string. Args: @@ -489,29 +542,50 @@ def parse(cls, raw: str) -> "WebVTTCueBlock": payload=stack[0], ) - def __str__(self): - """Return a string representation of the WebVTT cue block.""" + def format(self, omit_hours_if_zero: bool = False, omit_voice_end: bool = False) -> str: + """Format the WebVTT cue block as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in the timings. + omit_voice_end: If True and this cue block has a WebVTT cue voice span as + its only component, omit the voice end tag for brevity. + + Returns: + Formatted cue block string. + """ parts = [] if self.identifier: parts.append(f"{self.identifier}\n") - timings_line = str(self.timings) + timings_line = self.timings.format(omit_hours_if_zero=omit_hours_if_zero) parts.append(timings_line + "\n") for idx, span in enumerate(self.payload): - if idx == 0 and len(self.payload) == 1 and span.component.kind == "v": - # the end tag may be omitted for brevity + if omit_voice_end and idx == 0 and len(self.payload) == 1 and span.component.kind == "v": parts.append(str(span).removesuffix("")) else: parts.append(str(span)) return "".join(parts) + "\n" + def __str__(self) -> str: + """Return a string representation of the WebVTT cue block. + + Always returns the full format including hours in timestamps (HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. + Always returns the WebVTT cue voice spans with the voice end tag, even if this + cue block has a WebVTT cue voice span as a single component in the payload. Use + `format(omit_voice_end=True)` to get a shorter representation without the voice + end tag. 
+ """ + return self.format() + class WebVTTFile(BaseModel): """A model representing a WebVTT file.""" _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)") - title: Optional[str] = None cue_blocks: list[WebVTTCueBlock] + title: Optional[str] = None @staticmethod def verify_signature(content: str) -> bool: @@ -544,7 +618,7 @@ def validate_start_time(self) -> Self: return self @classmethod - def parse(cls, raw: str) -> "WebVTTFile": + def parse(cls, raw: str) -> Self: """Parse a WebVTT file. Args: @@ -579,14 +653,46 @@ def parse(cls, raw: str) -> "WebVTTFile": return cls(title=title, cue_blocks=cues) - def __iter__(self): + def __iter__(self) -> Iterator[WebVTTCueBlock]: # type: ignore[override] """Return an iterator over the cue blocks.""" return iter(self.cue_blocks) - def __getitem__(self, idx): + def __getitem__(self, idx) -> WebVTTCueBlock: """Return the cue block at the given index.""" return self.cue_blocks[idx] - def __len__(self): + def __len__(self) -> int: """Return the number of cue blocks.""" return len(self.cue_blocks) + + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the WebVTT file as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in the timings. + + Returns: + Formatted WebVTT file string. + """ + parts: list[str] = [] + + if self.title: + parts.append(f"WEBVTT {self.title}\n") + else: + parts.append("WEBVTT\n") + + for cue_block in self.cue_blocks: + parts.append("\n") + parts.append(cue_block.format(omit_hours_if_zero=omit_hours_if_zero)) + + # Remove the trailing newline from the last cue block + return "".join(parts).rstrip("\n") + + def __str__(self) -> str: + """Return a string representation of the WebVTT file. + + Always returns the full format including hours in timestamps (HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. 
+ """ + return self.format() diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 45a5d889..cea39ba5 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -2324,9 +2324,9 @@ "default": null, "description": "Classes for describing the cue significance", "examples": [ - "first", - "loud", - "yellow" + "b.first", + "v.loud", + "c.yellow" ], "title": "Classes" } diff --git a/test/data/doc/webvtt_example_01.gt.vtt b/test/data/doc/webvtt_example_01.gt.vtt new file mode 100644 index 00000000..cad1c72a --- /dev/null +++ b/test/data/doc/webvtt_example_01.gt.vtt @@ -0,0 +1,40 @@ +WEBVTT + +00:00:11.000 --> 00:00:13.000 +We are in New York City + +00:00:13.000 --> 00:00:16.000 +We’re actually at the Lucern Hotel, just down the street + +00:00:16.000 --> 00:00:18.000 +from the American Museum of Natural History + +00:00:18.000 --> 00:00:20.000 +And with me is Neil deGrasse Tyson + +00:00:20.000 --> 00:00:22.000 +Astrophysicist, Director of the Hayden Planetarium + +00:00:22.000 --> 00:00:24.000 +at the AMNH. + +00:00:24.000 --> 00:00:26.000 +Thank you for walking down here. + +00:00:27.000 --> 00:00:30.000 +And I want to do a follow-up on the last conversation we did. + +00:00:30.000 --> 00:00:31.500 +When we e-mailed— + +00:00:30.500 --> 00:00:32.500 +Didn’t we talk about enough in that conversation? + +00:00:32.000 --> 00:00:35.500 +No! No no no no; 'cos 'cos obviously 'cos + +00:00:32.500 --> 00:00:33.500 +Laughs + +00:00:35.500 --> 00:00:38.000 +You know I’m so excited my glasses are falling off here. 
\ No newline at end of file diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json new file mode 100644 index 00000000..5a7c9d29 --- /dev/null +++ b/test/data/doc/webvtt_example_01.json @@ -0,0 +1,313 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_01", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 16887312431371817791, + "filename": "webvtt_example_01.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 11.0, + "end_time": 13.0, + "voice": "Roger Bingham" + } + ], + "orig": "We are in New York City", + "text": "We are in New York City" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 13.0, + "end_time": 16.0, + "voice": "Roger Bingham" + } + ], + "orig": "We’re actually at the Lucern Hotel, just down the street", + "text": "We’re actually at the Lucern Hotel, just down the street" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + 
"start_time": 16.0, + "end_time": 18.0, + "voice": "Roger Bingham" + } + ], + "orig": "from the American Museum of Natural History", + "text": "from the American Museum of Natural History" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 18.0, + "end_time": 20.0, + "voice": "Roger Bingham" + } + ], + "orig": "And with me is Neil deGrasse Tyson", + "text": "And with me is Neil deGrasse Tyson" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 20.0, + "end_time": 22.0, + "voice": "Roger Bingham" + } + ], + "orig": "Astrophysicist, Director of the Hayden Planetarium", + "text": "Astrophysicist, Director of the Hayden Planetarium" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 22.0, + "end_time": 24.0, + "voice": "Roger Bingham" + } + ], + "orig": "at the AMNH.", + "text": "at the AMNH." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 24.0, + "end_time": 26.0, + "voice": "Roger Bingham" + } + ], + "orig": "Thank you for walking down here.", + "text": "Thank you for walking down here." + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 27.0, + "end_time": 30.0, + "voice": "Roger Bingham" + } + ], + "orig": "And I want to do a follow-up on the last conversation we did.", + "text": "And I want to do a follow-up on the last conversation we did." 
+ }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 30.0, + "end_time": 31.5, + "voice": "Roger Bingham" + } + ], + "orig": "When we e-mailed—", + "text": "When we e-mailed—" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 30.5, + "end_time": 32.5, + "voice": "Neil deGrasse Tyson" + } + ], + "orig": "Didn’t we talk about enough in that conversation?", + "text": "Didn’t we talk about enough in that conversation?" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 32.0, + "end_time": 35.5, + "voice": "Roger Bingham" + } + ], + "orig": "No! No no no no; 'cos 'cos obviously 'cos", + "text": "No! No no no no; 'cos 'cos obviously 'cos" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 32.5, + "end_time": 33.5, + "voice": "Neil deGrasse Tyson" + } + ], + "orig": "Laughs", + "text": "Laughs", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 35.5, + "end_time": 38.0, + "voice": "Roger Bingham" + } + ], + "orig": "You know I’m so excited my glasses are falling off here.", + "text": "You know I’m so excited my glasses are falling off here." 
+ } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt new file mode 100644 index 00000000..8f9811e7 --- /dev/null +++ b/test/data/doc/webvtt_example_02.gt.vtt @@ -0,0 +1,16 @@ +WEBVTT + +00:00:00.000 --> 00:00:02.000 +It’s a blue apple tree! + +00:00:02.000 --> 00:00:04.000 +No way! + +00:00:04.000 --> 00:00:06.000 +Hee! laughter + +00:00:06.000 --> 00:00:08.000 +That’s awesome! + +00:00:08.000 --> 00:00:10.000 +Sur les playground, ici à Montpellier \ No newline at end of file diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json new file mode 100644 index 00000000..2966a2e0 --- /dev/null +++ b/test/data/doc/webvtt_example_02.json @@ -0,0 +1,272 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_02", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 8584853280299071027, + "filename": "webvtt_example_02.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": 
"body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 0.0, + "end_time": 2.0, + "voice": "Esme", + "classes": [ + "v.first.loud" + ] + } + ], + "orig": "It\u2019s a blue apple tree!", + "text": "It\u2019s a blue apple tree!" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 2.0, + "end_time": 4.0, + "voice": "Mary" + } + ], + "orig": "No way!", + "text": "No way!" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0, + "voice": "Esme" + } + ], + "orig": "Hee!", + "text": "Hee!" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": " ", + "text": " " + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": "laughter", + "text": "laughter", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 6.0, + "end_time": 8.0, + "voice": "Mary", + "classes": [ + "v.loud" + ] + } + ], + "orig": "That\u2019s awesome!", + "text": "That\u2019s awesome!" 
+ }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": "Sur les ", + "text": "Sur les " + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0, + "languages": [ + "en" + ], + "classes": [ + "i.foreignphrase" + ] + } + ], + "orig": "playground", + "text": "playground", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": ", ici \u00e0 Montpellier", + "text": ", ici \u00e0 Montpellier" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_03.gt.vtt b/test/data/doc/webvtt_example_03.gt.vtt new file mode 100644 index 00000000..a4dc1291 --- /dev/null +++ b/test/data/doc/webvtt_example_03.gt.vtt @@ -0,0 +1,57 @@ +WEBVTT + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +00:00:04.963 --> 00:00:08.571 +OK, +I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 +00:00:08.571 --> 00:00:09.403 +properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 +00:00:10.683 --> 00:00:11.563 +Good. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 +00:00:13.363 --> 00:00:13.803 +Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 +00:00:49.603 --> 00:00:53.363 +I was also thinking. 
+ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 +00:00:54.963 --> 00:01:02.072 +Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 +00:01:02.072 --> 00:01:06.811 +some metadata, +some options that can be specific. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 +00:01:10.243 --> 00:01:13.014 +Yeah, +I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 +00:01:10.563 --> 00:01:12.643 +But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +00:01:13.014 --> 00:01:15.907 +than me. +I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +00:01:50.222 --> 00:01:51.643 +give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +00:01:52.043 --> 00:01:55.043 +Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +00:01:54.603 --> 00:01:55.283 +See you. \ No newline at end of file diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json new file mode 100644 index 00000000..dddce0f2 --- /dev/null +++ b/test/data/doc/webvtt_example_03.json @@ -0,0 +1,406 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_03", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 11620880316586573676, + "filename": "webvtt_example_03.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + 
{ + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "voice": "Speaker A" + } + ], + "orig": "OK,", + "text": "OK," + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "voice": "Speaker A" + } + ], + "orig": "I think now we should be recording", + "text": "I think now we should be recording" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 8.571, + "end_time": 9.403, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", + "voice": "Speaker A" + } + ], + "orig": "properly.", + "text": "properly." + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 10.683, + "end_time": 11.563, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" + } + ], + "orig": "Good.", + "text": "Good." + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 13.363, + "end_time": 13.803, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", + "voice": "Speaker A" + } + ], + "orig": "Yeah.", + "text": "Yeah." 
+ }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 49.603, + "end_time": 53.363, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", + "voice": "Speaker B" + } + ], + "orig": "I was also thinking.", + "text": "I was also thinking." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 54.963, + "end_time": 62.072, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", + "voice": "Speaker B" + } + ], + "orig": "Would be maybe good to create items,", + "text": "Would be maybe good to create items," + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "voice": "Speaker B" + } + ], + "orig": "some metadata,", + "text": "some metadata," + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "voice": "Speaker B" + } + ], + "orig": "some options that can be specific.", + "text": "some options that can be specific." 
+ }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "voice": "Speaker A" + } + ], + "orig": "Yeah,", + "text": "Yeah," + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "voice": "Speaker A" + } + ], + "orig": "I mean I think you went even more than", + "text": "I mean I think you went even more than" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 70.563, + "end_time": 72.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", + "voice": "Speaker B" + } + ], + "orig": "But we preserved the atoms.", + "text": "But we preserved the atoms." + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "voice": "Speaker A" + } + ], + "orig": "than me.", + "text": "than me." + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "voice": "Speaker A" + } + ], + "orig": "I just opened the format.", + "text": "I just opened the format." 
+ }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 110.222, + "end_time": 111.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", + "voice": "Speaker A" + } + ], + "orig": "give it a try, yeah.", + "text": "give it a try, yeah." + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 112.043, + "end_time": 115.043, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", + "voice": "Speaker B" + } + ], + "orig": "Okay, talk to you later.", + "text": "Okay, talk to you later." + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 114.603, + "end_time": 115.283, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", + "voice": "Speaker A" + } + ], + "orig": "See you.", + "text": "See you." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt new file mode 100644 index 00000000..ce7fcf65 --- /dev/null +++ b/test/data/doc/webvtt_example_04.gt.vtt @@ -0,0 +1,9 @@ +WEBVTT Danger of Nitrogen + +00:00:01.000 --> 00:00:04.000 +Never drink liquid nitrogen. + +00:00:05.000 --> 00:00:09.000 +— It will perforate your stomach. +— You could die. +This is true. 
\ No newline at end of file diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json new file mode 100644 index 00000000..f96765fc --- /dev/null +++ b/test/data/doc/webvtt_example_04.json @@ -0,0 +1,194 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_04", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 11822397499369478441, + "filename": "webvtt_example_04.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "Danger of Nitrogen", + "text": "Danger of Nitrogen" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 1.0, + "end_time": 4.0 + } + ], + "orig": "Never drink liquid nitrogen.", + "text": "Never drink liquid nitrogen." + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": "\u2014 It will perforate your stomach.", + "text": "\u2014 It will perforate your stomach." 
+ }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": "\u2014 You could ", + "text": "\u2014 You could " + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 5.0, + "end_time": 9.0, + "classes": [ + "b.loud" + ] + } + ], + "orig": "die", + "text": "die", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": ".", + "text": "." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 5.0, + "end_time": 9.0, + "voice": "John" + } + ], + "orig": "This is true.", + "text": "This is true." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt new file mode 100644 index 00000000..fd7b788c --- /dev/null +++ b/test/data/doc/webvtt_example_05.gt.vtt @@ -0,0 +1,10 @@ +WEBVTT + +agcvs-08234 +04:03:00.000 --> 04:06:00.000 +Last night the chef surprised us with a culinary adventure. + +agcvs-08234 +04:06:00.000 --> 04:06:58.239 +The waiter offered a steaming bowl of paella that instantly transported the diners to a sunny Mediterranean coast. +The dessert’s unexpected arcobaleno of flavors left everyone in awe. 
\ No newline at end of file diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json new file mode 100644 index 00000000..616c94fc --- /dev/null +++ b/test/data/doc/webvtt_example_05.json @@ -0,0 +1,344 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_04", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 5389775195091554844, + "filename": "webvtt_example_04.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14580.0, + "end_time": 14760.0, + "identifier": "agcvs-08234" + } + ], + "orig": "Last night the chef surprised us with a culinary adventure.", + "text": "Last night the chef surprised us with a culinary adventure." 
+ }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The waiter offered a ", + "text": "The waiter offered a " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "steaming bowl of ", + "text": "steaming bowl of ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "languages": [ + "es-ES" + ] + } + ], + "orig": "paella", + "text": "paella", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " that instantly transported the diners to a sunny Mediterranean coast.", + "text": " that instantly transported the diners to a sunny Mediterranean coast." 
+ }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The dessert\u2019s ", + "text": "The dessert\u2019s " + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "classes": [ + "b.loud" + ] + } + ], + "orig": "unexpected", + "text": "unexpected", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " ", + "text": " ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "languages": [ + "it" + ] + } + ], + "orig": "arcobaleno", + "text": "arcobaleno", + "formatting": { + "bold": false, + "italic": true, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " of flavors", + "text": " of flavors", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + 
"strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " left everyone in awe.", + "text": " left everyone in awe." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/test_serialization.py b/test/test_serialization.py index 6fe3b386..fd68a347 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -15,6 +15,7 @@ MarkdownParams, OrigListItemMarkerMode, ) +from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( @@ -563,3 +564,27 @@ def test_html_inline_and_formatting(): ser = HTMLDocSerializer(doc=doc) actual = ser.serialize().text verify(exp_file=src.with_suffix(".gt.html"), actual=actual) + + +# =============================== +# WebVTT tests +# =============================== + + +@pytest.mark.parametrize( + "file_name", + [ + "webvtt_example_01", + "webvtt_example_02", + "webvtt_example_03", + "webvtt_example_04", + "webvtt_example_05", + ], +) +def test_webvtt(file_name): + src = Path(f"./test/data/doc/{file_name}.json") + doc = DoclingDocument.load_from_json(src) + + ser = WebVTTDocSerializer(doc=doc) + actual = ser.serialize().text + verify(exp_file=src.with_suffix(".gt.vtt"), actual=actual) diff --git a/test/test_webvtt.py b/test/test_webvtt.py index a3443fd2..938da37c 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -255,7 +255,12 @@ def test_webvtt_file() -> None: "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. 
" "https://www.w3.org/TR/webvtt1/\n\n" ) - reverse += "\n".join([str(block) for block in vtt.cue_blocks]) + reverse += "\n".join( + [ + block.format(omit_hours_if_zero=True, omit_voice_end=True) + for block in vtt.cue_blocks + ] + ) assert content == reverse.rstrip() with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f: From 0b24861ee1d700c098aca894714c25ee125988df Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Mon, 19 Jan 2026 20:04:17 +0100 Subject: [PATCH 16/20] fix(webvtt): add 'text/vtt' as extra mimetype Add 'text/vtt' as extra MIME type to support WebVTT serialization, since it is not supported by 'mimetypes' with python < 3.11 Signed-off-by: Cesar Berrospi Ramis --- docling_core/types/doc/document.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index dc0dbbf2..82e71751 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -960,6 +960,7 @@ class DocumentOrigin(BaseModel): "text/asciidoc", "text/markdown", "text/csv", + "text/vtt", "audio/x-wav", "audio/wav", "audio/mp3", From 5e0a7870f842ce82b8ca09eed5264013e728ded3 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Thu, 22 Jan 2026 18:21:22 +0100 Subject: [PATCH 17/20] refactor(webvtt): roll back DocItem.prov as list of ProvenanceItem Signed-off-by: Cesar Berrospi Ramis --- docling_core/experimental/idoctags.py | 14 +- docling_core/transforms/serializer/azure.py | 11 +- docling_core/transforms/serializer/common.py | 15 +- docling_core/transforms/serializer/doctags.py | 6 +- docling_core/transforms/serializer/webvtt.py | 14 +- .../visualizer/key_value_visualizer.py | 5 +- .../visualizer/layout_visualizer.py | 3 +- .../visualizer/reading_order_visualizer.py | 3 +- .../transforms/visualizer/table_visualizer.py | 11 +- docling_core/types/doc/__init__.py | 4 +- docling_core/types/doc/document.py | 174 ++++--- docling_core/types/doc/webvtt.py | 2 +- 
docling_core/utils/legacy.py | 3 - docs/DoclingDocument.json | 467 +++++++++++------- test/data/doc/webvtt_example_01.json | 39 +- test/data/doc/webvtt_example_02.json | 27 +- test/data/doc/webvtt_example_03.json | 51 +- test/data/doc/webvtt_example_04.json | 18 +- test/data/doc/webvtt_example_05.json | 33 +- test/test_deserializer_idoctags.py | 4 +- test/test_doc_base.py | 12 +- test/test_serialization_doctag.py | 3 +- test/test_serialization_idoctag.py | 22 +- test/test_webvtt.py | 1 - 24 files changed, 553 insertions(+), 389 deletions(-) diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py index 7376062b..dd19d7f0 100644 --- a/docling_core/experimental/idoctags.py +++ b/docling_core/experimental/idoctags.py @@ -175,8 +175,6 @@ def _create_location_tokens_for_item( return "" out: list[str] = [] for prov in item.prov: - if not isinstance(prov, ProvenanceItem): - continue page_w, page_h = doc.pages[prov.page_no].size.as_tuple() bbox = prov.bbox.to_top_left_origin(page_h).as_tuple() out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres)) @@ -1381,14 +1379,12 @@ def serialize( # we will need to do something more complex I believe ... res: list[SerializationResult] = [] for idp, prov_ in enumerate(item.prov): - if not isinstance(prov_, ProvenanceItem): - continue - item_: TextItem = copy.deepcopy(item) + item_ = copy.deepcopy(item) item_.prov = [prov_] item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]] # it must be `orig`, not `text` here! 
item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]] - if isinstance(item_.prov[0], ProvenanceItem): - item_.prov[0].charspan = (0, len(item_.orig)) + + item_.prov[0].charspan = (0, len(item_.orig)) # marker field should be cleared on subsequent split parts if idp > 0 and isinstance(item_, ListItem): @@ -1752,7 +1748,7 @@ def _emit_otsl( if params.add_table_cell_location: # Check if we have all required information for location serialization - if item.prov and isinstance(item.prov[0], ProvenanceItem): + if item.prov and len(item.prov) > 0: page_no = item.prov[0].page_no if doc.pages and page_no in doc.pages: page_w, page_h = doc.pages[page_no].size.as_tuple() @@ -1901,8 +1897,6 @@ def serialize( for it, _ in doc.iterate_items(root=item): if isinstance(it, DocItem) and it.prov: for prov in it.prov: - if not isinstance(prov, ProvenanceItem): - continue page_w, page_h = doc.pages[prov.page_no].size.as_tuple() boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple()) prov_page_w_h = (page_w, page_h, prov.page_no) diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py index ed91aee2..1addf996 100644 --- a/docling_core/transforms/serializer/azure.py +++ b/docling_core/transforms/serializer/azure.py @@ -55,7 +55,6 @@ ListGroup, NodeItem, PictureItem, - ProvenanceItem, RefItem, RichTableCell, TableItem, @@ -77,7 +76,7 @@ def _bbox_to_polygon_coords( def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]: """Compute a TOPLEFT-origin polygon for the first provenance of the item.""" - if not item.prov or not isinstance(item.prov[0], ProvenanceItem): + if not item.prov: return None prov = item.prov[0] @@ -188,7 +187,7 @@ def serialize( # Lists may be represented either as TextItem(ListItem) or via groups; # we treat any TextItem as a paragraph-like entry. 
- if item.prov and isinstance(item.prov[0], ProvenanceItem): + if item.prov: prov = item.prov[0] page_no = prov.page_no polygon = _bbox_to_polygon_for_item(doc, item) @@ -238,7 +237,7 @@ def serialize( ) -> SerializationResult: assert isinstance(doc_serializer, AzureDocSerializer) - if not item.prov or not isinstance(item.prov[0], ProvenanceItem): + if not item.prov: return create_ser_result() prov = item.prov[0] @@ -309,7 +308,7 @@ def serialize( ) -> SerializationResult: assert isinstance(doc_serializer, AzureDocSerializer) - if not item.prov or not isinstance(item.prov[0], ProvenanceItem): + if not item.prov: return create_ser_result() prov = item.prov[0] @@ -325,7 +324,7 @@ def serialize( for foot_ref in item.footnotes: if isinstance(foot_ref, RefItem): tgt = foot_ref.resolve(doc) - if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem): + if isinstance(tgt, TextItem) and tgt.prov: f_poly = _bbox_to_polygon_for_item(doc, tgt) if f_poly is not None: foots.append( diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index c36062e0..43bfd54b 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -52,7 +52,6 @@ PictureDataType, PictureItem, PictureMoleculeData, - ProvenanceItem, Script, TableAnnotationType, TableItem, @@ -109,7 +108,7 @@ def _iterate_items( add_page_breaks=add_page_breaks, visited=my_visited, ): - if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem): + if isinstance(it, DocItem) and it.prov: page_no = it.prov[0].page_no if prev_page_nr is not None and page_no > prev_page_nr: yield ( @@ -121,7 +120,7 @@ def _iterate_items( lvl, ) break - elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem): + elif isinstance(item, DocItem) and item.prov: page_no = item.prov[0].page_no if prev_page_nr is None or page_no > prev_page_nr: if prev_page_nr is not None: 
# close previous range @@ -302,13 +301,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: or item.content_layer not in params.layers or ( params.pages is not None - and ( - (not item.prov) - or ( - isinstance(item.prov[0], ProvenanceItem) - and item.prov[0].page_no not in params.pages - ) - ) + and ((not item.prov) or item.prov[0].page_no not in params.pages) ) ) ) @@ -355,6 +348,7 @@ def serialize( empty_res = create_ser_result() my_item = item or self.doc.body + if my_item == self.doc.body: if my_item.meta and not self._meta_is_wrapped(): meta_part = self.serialize_meta(item=my_item, **my_kwargs) @@ -677,7 +671,6 @@ def _get_applicable_pages(self) -> Optional[list[int]]: if ( isinstance(item, DocItem) and item.prov - and isinstance(item.prov[0], ProvenanceItem) and (self.params.pages is None or item.prov[0].page_no in self.params.pages) and ix >= self.params.start_idx and ix < self.params.stop_idx diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index 16549652..dc8c520f 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -345,7 +345,7 @@ def serialize( results: list[SerializationResult] = [] page_no = 1 - if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem): + if len(item.prov) > 0: page_no = item.prov[0].page_no if params.add_location: @@ -363,7 +363,7 @@ def serialize( for cell in item.graph.cells: cell_txt = "" - if cell.prov is not None and isinstance(cell.prov, ProvenanceItem): + if cell.prov is not None: if len(doc.pages.keys()): page_w, page_h = doc.pages[page_no].size.as_tuple() cell_txt += DocumentToken.get_location( @@ -471,7 +471,7 @@ def _get_inline_location_tags( doc_items: list[DocItem] = [] for it, _ in doc.iterate_items(root=item): if isinstance(it, DocItem): - for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)): + for prov in it.prov: boxes.append(prov.bbox) doc_items.append(it) if prov 
is None: diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py index 15fdbc3b..bfd1fd55 100644 --- a/docling_core/transforms/serializer/webvtt.py +++ b/docling_core/transforms/serializer/webvtt.py @@ -38,10 +38,10 @@ ListGroup, NodeItem, PictureItem, - ProvenanceTrack, TableItem, TextItem, TitleItem, + TrackProvenance, ) from docling_core.types.doc.webvtt import ( START_TAG_NAMES, @@ -140,15 +140,15 @@ def serialize( if isinstance(item, TitleItem): return create_ser_result(text=item.text, span_source=item) - # Only process items with ProvenanceTrack (WebVTT cues) - if not item.text or not item.prov or not isinstance(item.prov[0], ProvenanceTrack): + # Only process items with TrackProvenance (WebVTT cues) + if not item.text or not item.source or item.source[0].kind != "track": return create_ser_result() # Apply post-processing here: formatting, classes, language, and voice # If the TextItem is part of an InlineGroup, we need to further post-process it # within the group context - prov: ProvenanceTrack = item.prov[0] + prov: TrackProvenance = item.source[0] text: str = doc_serializer.post_process( text=item.text, formatting=item.formatting, @@ -417,7 +417,7 @@ def _extract_classes(classes: list[str]) -> dict[str, list[str]]: """Extract tag and values from provenance classes. Args: - classes: The classes from a ProvenanceTrack object. + classes: The classes from a TrackProvenance object. Returns: Map of tag to class values. 
@@ -463,8 +463,8 @@ def serialize_doc( continue if isinstance(doc_item, InlineGroup) and doc_item.children: doc_item = doc_item.children[0].resolve(doc=self.doc) - if isinstance(doc_item, TextItem) and doc_item.prov and isinstance(doc_item.prov[0], ProvenanceTrack): - prov: ProvenanceTrack = doc_item.prov[0] + if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track": + prov: TrackProvenance = doc_item.source[0] if ( prov.identifier == id and timings diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py index e2b10264..89b07f77 100644 --- a/docling_core/transforms/visualizer/key_value_visualizer.py +++ b/docling_core/transforms/visualizer/key_value_visualizer.py @@ -21,7 +21,6 @@ DoclingDocument, GraphCellLabel, GraphLinkLabel, - ProvenanceItem, ) # --------------------------------------------------------------------------- @@ -87,7 +86,7 @@ def _draw_key_value_layer( # First draw cells (rectangles + optional labels) # ------------------------------------------------------------------ for cell in cell_dict.values(): - if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no: + if cell.prov is None or cell.prov.page_no != page_no: continue # skip cells not on this page or without bbox tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height) @@ -154,8 +153,6 @@ def _draw_key_value_layer( if ( src_cell.prov is None or tgt_cell.prov is None - or not isinstance(src_cell.prov, ProvenanceItem) - or not isinstance(tgt_cell.prov, ProvenanceItem) or src_cell.prov.page_no != page_no or tgt_cell.prov.page_no != page_no ): diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py index 8ac6bf81..043fedac 100644 --- a/docling_core/transforms/visualizer/layout_visualizer.py +++ 
b/docling_core/transforms/visualizer/layout_visualizer.py @@ -17,7 +17,6 @@ DocItem, DocItemLabel, DoclingDocument, - ProvenanceItem, TextCell, ) @@ -179,7 +178,7 @@ def _draw_doc_layout( if len(elem.prov) == 0: continue # Skip elements without provenances - for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)): + for prov in elem.prov: page_nr = prov.page_no if page_nr in my_images: diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py index 27583613..60874333 100644 --- a/docling_core/transforms/visualizer/reading_order_visualizer.py +++ b/docling_core/transforms/visualizer/reading_order_visualizer.py @@ -14,7 +14,6 @@ DocItem, DoclingDocument, PictureItem, - ProvenanceItem, ) @@ -131,7 +130,7 @@ def _draw_doc_reading_order( if len(elem.prov) == 0: continue # Skip elements without provenances - for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)): + for prov in elem.prov: page_no = prov.page_no image = my_images.get(page_no) diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py index d3790d6b..5f601f9a 100644 --- a/docling_core/transforms/visualizer/table_visualizer.py +++ b/docling_core/transforms/visualizer/table_visualizer.py @@ -10,12 +10,7 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc import ( - ContentLayer, - DoclingDocument, - ProvenanceItem, - TableItem, -) +from docling_core.types.doc import ContentLayer, DoclingDocument, TableItem _log = logging.getLogger(__name__) @@ -190,10 +185,10 @@ def _draw_doc_tables( image = pil_img.copy() my_images[page_nr] = image - for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)): + for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)): if 
not isinstance(elem, TableItem): continue - if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem): + if len(elem.prov) == 0: continue # Skip elements without provenances if len(elem.prov) == 1: diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index d8ddd0b4..c3a2b237 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -46,6 +46,7 @@ PictureClassificationClass, PictureClassificationData, PictureClassificationMetaField, + PictureClassificationPrediction, PictureDataType, PictureItem, PictureLineChartData, @@ -56,7 +57,7 @@ PictureStackedBarChartData, PictureTabularChartData, ProvenanceItem, - ProvenanceTrack, + ProvenanceType, RefItem, RichTableCell, Script, @@ -69,6 +70,7 @@ TabularChartMetaField, TextItem, TitleItem, + TrackProvenance, UnorderedList, ) from .labels import ( diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 82e71751..e864574f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -32,12 +32,10 @@ AnyUrl, BaseModel, ConfigDict, - Discriminator, Field, FieldSerializationInfo, SerializerFunctionWrapHandler, StringConstraints, - Tag, computed_field, field_serializer, field_validator, @@ -1170,13 +1168,27 @@ class ProvenanceItem(BaseModel): charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")] -class ProvenanceTrack(BaseModel): - """Provenance information for elements extracted from media assets. +class BaseProvenance(BaseModel): + """Base class for provenance information. - A `ProvenanceTrack` instance describes a cue in a text track associated with a - media element (audio, video, subtitles, screen recordings, ...). + Represents the provenance of an extracted component within a digital asset. """ + kind: Annotated[ + str, Field(description="Kind of provenance. 
It is used as a discriminator for the provenance type.") + ] + + +class TrackProvenance(BaseProvenance): + """Provenance metadata for a cue extracted from a media track. + + A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions, + etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle + block, an audio clip, or a timed marker in a screen-recording. + """ + + kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track" + start_time: Annotated[ float, Field( @@ -1221,25 +1233,22 @@ def check_order(self) -> Self: return self -def _get_provenance_discriminator_value(v: Any) -> str: - """Callable discriminator for provenance instances. +ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")] +"""Union type for all provenance types. - Args: - v: Either dict or model input. - - Returns: - A string discriminator of provenance instances. - """ - fields = {"bbox", "page_no", "charspan"} - if isinstance(v, dict): - return "item" if any(f in v for f in fields) else "track" - return "item" if any(hasattr(v, f) for f in fields) else "track" +This type alias represents a discriminated union of all available provenance types that can be associated with +extracted elements in a document. The `kind` field is used as a discriminator to determine the specific +provenance type at runtime. +Currently supported provenance types: + - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles) -ProvenanceType = Annotated[ - Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]], - Discriminator(_get_provenance_discriminator_value), -] +Notes: + - Additional provenance types may be added to this union in the future to support + other content sources. 
+ - For documents with an implicit or explicit layout, such as PDF, HTML, docx, pptx, or markdown files, the + `ProvenanceItem` should still be used. +""" class ContentLayer(str, Enum): @@ -1544,20 +1553,28 @@ class FineRef(RefItem): range: Optional[tuple[int, int]] = None # start_inclusive, end_exclusive -class DocItem(NodeItem): # Base type for any element that carries content, can be a leaf node - """DocItem.""" +class DocItem(NodeItem): + """Base type for any element that carries content, can be a leaf node.""" label: DocItemLabel - prov: list[ProvenanceType] = [] + prov: list[ProvenanceItem] = [] + source: Annotated[ + list[ProvenanceType], + Field( + description="The provenance of this document item. Currently, it is only used for media track provenance." + ), + ] = [] comments: list[FineRef] = [] # References to comment items annotating this content @model_serializer(mode="wrap") def _custom_pydantic_serialize(self, handler: SerializerFunctionWrapHandler) -> dict: dumped = handler(self) - # suppress serializing comment list when empty: - if dumped.get("comments") == []: - del dumped["comments"] + # suppress serializing comment and source lists when empty: + for field in {"comments", "source"}: + if dumped.get(field) == []: + del dumped[field] + return dumped def get_location_tokens( @@ -1573,7 +1590,7 @@ def get_location_tokens( return "" location = "" - for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)): + for prov in self.prov: page_w, page_h = doc.pages[prov.page_no].size.as_tuple() loc_str = DocumentToken.get_location( @@ -1609,9 +1626,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL if not page_image: return None crop_bbox = ( - prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size( - old_size=page.size, new_size=page.image.size - ) + self.prov[prov_index] + .bbox.to_top_left_origin(page_height=page.size.height) + .scale_to_size(old_size=page.size, 
new_size=page.image.size) # .scaled(scale=page_image.height / page.size.height) ) return page_image.crop(crop_bbox.as_tuple()) @@ -2282,7 +2299,7 @@ def export_to_otsl( return "" page_no = 0 - if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem): + if len(self.prov) > 0: page_no = self.prov[0].page_no for i in range(nrows): @@ -2412,7 +2429,7 @@ class GraphCell(BaseModel): text: str # sanitized text orig: str # text as seen on document - prov: Optional[ProvenanceType] = None + prov: Optional[ProvenanceItem] = None # in case you have a text, table or picture item item_ref: Optional[RefItem] = None @@ -3061,7 +3078,7 @@ def add_list_item( enumerated: bool = False, marker: Optional[str] = None, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3072,7 +3089,7 @@ def add_list_item( :param label: str: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ @@ -3113,7 +3130,7 @@ def add_text( label: DocItemLabel, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3124,7 +3141,7 @@ def add_text( :param label: str: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ @@ -3250,7 +3267,7 @@ def add_table( self, data: TableData, caption: Optional[Union[TextItem, RefItem]] = None, # This 
is not cool yet. - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, label: DocItemLabel = DocItemLabel.TABLE, content_layer: Optional[ContentLayer] = None, @@ -3260,7 +3277,7 @@ def add_table( :param data: TableData: :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) :param label: DocItemLabel: (Default value = DocItemLabel.TABLE) @@ -3296,7 +3313,7 @@ def add_picture( annotations: Optional[list[PictureDataType]] = None, image: Optional[ImageRef] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, ): @@ -3305,7 +3322,7 @@ def add_picture( :param data: Optional[list[PictureData]]: (Default value = None) :param caption: Optional[Union[TextItem: :param RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3337,7 +3354,7 @@ def add_title( self, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3348,7 +3365,7 @@ def add_title( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3383,7 
+3400,7 @@ def add_code( code_language: Optional[CodeLanguageLabel] = None, orig: Optional[str] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3396,7 +3413,7 @@ def add_code( :param orig: Optional[str]: (Default value = None) :param caption: Optional[Union[TextItem: :param RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3434,7 +3451,7 @@ def add_formula( self, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3445,7 +3462,7 @@ def add_formula( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3479,7 +3496,7 @@ def add_heading( text: str, orig: Optional[str] = None, level: LevelNumber = 1, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, @@ -3491,7 +3508,7 @@ def add_heading( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: 
Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3524,13 +3541,13 @@ def add_heading( def add_key_values( self, graph: GraphData, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, ): """add_key_values. :param graph: GraphData: - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3555,13 +3572,13 @@ def add_key_values( def add_form( self, graph: GraphData, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, ): """add_form. :param graph: GraphData: - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ if not parent: @@ -3750,7 +3767,7 @@ def insert_list_item( enumerated: bool = False, marker: Optional[str] = None, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -3763,7 +3780,7 @@ def insert_list_item( :param enumerated: bool: (Default value = False) :param marker: Optional[str]: (Default value = None) :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -3822,7 +3839,7 @@ def insert_text( label: DocItemLabel, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: 
Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -3834,7 +3851,7 @@ def insert_text( :param label: DocItemLabel: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -3934,7 +3951,7 @@ def insert_table( sibling: NodeItem, data: TableData, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, label: DocItemLabel = DocItemLabel.TABLE, content_layer: Optional[ContentLayer] = None, annotations: Optional[list[TableAnnotationType]] = None, @@ -3945,7 +3962,7 @@ def insert_table( :param sibling: NodeItem: :param data: TableData: :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param label: DocItemLabel: (Default value = DocItemLabel.TABLE) :param content_layer: Optional[ContentLayer]: (Default value = None) :param annotations: Optional[list[TableAnnotationType]]: (Default value = None) @@ -3982,7 +3999,7 @@ def insert_picture( annotations: Optional[list[PictureDataType]] = None, image: Optional[ImageRef] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, after: bool = True, ) -> PictureItem: @@ -3992,7 +4009,7 @@ def insert_picture( :param annotations: Optional[list[PictureDataType]]: (Default value = None) :param image: Optional[ImageRef]: 
(Default value = None) :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param after: bool: (Default value = True) @@ -4026,7 +4043,7 @@ def insert_title( sibling: NodeItem, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4037,7 +4054,7 @@ def insert_title( :param sibling: NodeItem: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4077,7 +4094,7 @@ def insert_code( code_language: Optional[CodeLanguageLabel] = None, orig: Optional[str] = None, caption: Optional[Union[TextItem, RefItem]] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4090,7 +4107,7 @@ def insert_code( :param code_language: Optional[str]: (Default value = None) :param orig: Optional[str]: (Default value = None) :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) 
:param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4132,7 +4149,7 @@ def insert_formula( sibling: NodeItem, text: str, orig: Optional[str] = None, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4143,7 +4160,7 @@ def insert_formula( :param sibling: NodeItem: :param text: str: :param orig: Optional[str]: (Default value = None) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4182,7 +4199,7 @@ def insert_heading( text: str, orig: Optional[str] = None, level: LevelNumber = 1, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, content_layer: Optional[ContentLayer] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -4194,7 +4211,7 @@ def insert_heading( :param text: str: :param orig: Optional[str]: (Default value = None) :param level: LevelNumber: (Default value = 1) - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param content_layer: Optional[ContentLayer]: (Default value = None) :param formatting: Optional[Formatting]: (Default value = None) :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None) @@ -4232,14 +4249,14 @@ def insert_key_values( self, sibling: NodeItem, graph: GraphData, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, after: bool = True, ) -> KeyValueItem: """Creates a new KeyValueItem item and inserts it into the document. 
:param sibling: NodeItem: :param graph: GraphData: - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param after: bool: (Default value = True) :returns: KeyValueItem: The newly created KeyValueItem item. @@ -4261,14 +4278,14 @@ def insert_form( self, sibling: NodeItem, graph: GraphData, - prov: Optional[ProvenanceType] = None, + prov: Optional[ProvenanceItem] = None, after: bool = True, ) -> FormItem: """Creates a new FormItem item and inserts it into the document. :param sibling: NodeItem: :param graph: GraphData: - :param prov: Optional[ProvenanceType]: (Default value = None) + :param prov: Optional[ProvenanceItem]: (Default value = None) :param after: bool: (Default value = True) :returns: FormItem: The newly created FormItem item. @@ -4605,10 +4622,7 @@ def _iterate_items_with_stack( (not isinstance(root, GroupItem) or with_groups) and ( not isinstance(root, DocItem) - or ( - page_nrs is None - or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem)) - ) + or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov)) ) and root.content_layer in my_layers ) @@ -4730,7 +4744,7 @@ def _with_pictures_refs( else: obj_path = loc_path - if item.image is None and isinstance(item.prov[0], ProvenanceItem): + if item.image is None: scale = img.size[0] / item.prov[0].bbox.width item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale)) elif item.image is not None: @@ -6136,7 +6150,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) -> if isinstance(new_item, DocItem): # update page numbers # NOTE other prov sources (e.g. 
GraphCell) currently not covered - for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)): + for prov in new_item.prov: prov.page_no += page_delta if item.parent: diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 6bc4a219..297e97fb 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -406,7 +406,7 @@ class WebVTTCueBlock(BaseModel): model_config = ConfigDict(regex_engine="python-re") - identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier") + identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")] payload: Annotated[ list[WebVTTCueComponentWithTerminator], diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py index 5ebac4be..26042436 100644 --- a/docling_core/utils/legacy.py +++ b/docling_core/utils/legacy.py @@ -165,7 +165,6 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f span=[0, len(item.text)], ) for p in item.prov - if isinstance(p, ProvenanceItem) ] main_text.append( BaseText( @@ -287,7 +286,6 @@ def _make_spans(cell: TableCell, table_item: TableItem): span=[0, 0], ) for p in item.prov - if isinstance(p, ProvenanceItem) ], ) ) @@ -315,7 +313,6 @@ def _make_spans(cell: TableCell, table_item: TableItem): span=[0, len(caption)], ) for p in item.prov - if isinstance(p, ProvenanceItem) ], obj_type=doc_item_label_to_legacy_type(item.label), text=caption, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index cea39ba5..b37260eb 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -233,16 +233,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + 
"description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -658,16 +670,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -807,16 +831,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -894,21 +930,13 @@ "prov": { "anyOf": [ { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" - }, - { - "$ref": "#/$defs/ProvenanceTrack" - } - ] + "$ref": "#/$defs/ProvenanceItem" }, { "type": "null" } ], - "default": null, - "title": "Prov" + "default": null }, "item_ref": { "anyOf": [ @@ -1227,16 +1255,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -1406,16 +1446,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -1789,16 +1841,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -2224,120 +2288,6 @@ "title": "ProvenanceItem", "type": "object" }, - "ProvenanceTrack": { - "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).", - "properties": { - "start_time": { - "description": "Start time offset of the track cue in seconds", - "examples": [ - 11.0, - 6.5, - 5370.0 - ], - "title": "Start Time", - "type": "number" - }, - "end_time": { - "description": "End time offset of the track cue in seconds", - "examples": [ - 12.0, - 8.2, - 5370.1 - ], - "title": "End Time", - "type": "number" - }, - "identifier": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "An identifier of the cue", - "examples": [ - "test", - "123", - "b72d946" - ], - "title": "Identifier" - }, - "voice": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, 
- "description": "The cue voice (speaker)", - "examples": [ - "Mary", - "Fred", - "Name Surname" - ], - "title": "Voice" - }, - "languages": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Languages of the cue in BCP 47 language tag format", - "examples": [ - [ - "en", - "en-GB" - ], - [ - "fr-CA" - ] - ], - "title": "Languages" - }, - "classes": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "minItems": 1, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Classes for describing the cue significance", - "examples": [ - "b.first", - "v.loud", - "c.yellow" - ], - "title": "Classes" - } - }, - "required": [ - "start_time", - "end_time" - ], - "title": "ProvenanceTrack", - "type": "object" - }, "RefItem": { "description": "RefItem.", "properties": { @@ -2494,16 +2444,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -2796,16 +2758,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -3008,16 +2982,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -3127,16 +3113,28 @@ "prov": { "default": [], "items": { - "oneOf": [ - { - "$ref": "#/$defs/ProvenanceItem" + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" }, + "propertyName": "kind" + }, + "oneOf": [ { - "$ref": "#/$defs/ProvenanceTrack" + "$ref": "#/$defs/TrackProvenance" } ] }, - "title": "Prov", + "title": "Source", "type": "array" }, "comments": { @@ -3192,6 +3190,127 @@ ], "title": "TitleItem", "type": "object" + }, + "TrackProvenance": { + "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). 
A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.", + "properties": { + "kind": { + "const": "track", + "default": "track", + "description": "Identifiers this type of provenance.", + "title": "Kind", + "type": "string" + }, + "start_time": { + "description": "Start time offset of the track cue in seconds", + "examples": [ + 11.0, + 6.5, + 5370.0 + ], + "title": "Start Time", + "type": "number" + }, + "end_time": { + "description": "End time offset of the track cue in seconds", + "examples": [ + 12.0, + 8.2, + 5370.1 + ], + "title": "End Time", + "type": "number" + }, + "identifier": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An identifier of the cue", + "examples": [ + "test", + "123", + "b72d946" + ], + "title": "Identifier" + }, + "voice": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The cue voice (speaker)", + "examples": [ + "Mary", + "Fred", + "Name Surname" + ], + "title": "Voice" + }, + "languages": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Languages of the cue in BCP 47 language tag format", + "examples": [ + [ + "en", + "en-GB" + ], + [ + "fr-CA" + ] + ], + "title": "Languages" + }, + "classes": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "minItems": 1, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Classes for describing the cue significance", + "examples": [ + "b.first", + "v.loud", + "c.yellow" + ], + "title": "Classes" + } + }, + "required": [ + "start_time", + "end_time" + ], + "title": "TrackProvenance", + "type": "object" } }, "description": "DoclingDocument.", diff --git a/test/data/doc/webvtt_example_01.json 
b/test/data/doc/webvtt_example_01.json index 5a7c9d29..78ce13b6 100644 --- a/test/data/doc/webvtt_example_01.json +++ b/test/data/doc/webvtt_example_01.json @@ -71,8 +71,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 11.0, "end_time": 13.0, "voice": "Roger Bingham" @@ -89,8 +90,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 13.0, "end_time": 16.0, "voice": "Roger Bingham" @@ -107,8 +109,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 16.0, "end_time": 18.0, "voice": "Roger Bingham" @@ -125,8 +128,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 18.0, "end_time": 20.0, "voice": "Roger Bingham" @@ -143,8 +147,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 20.0, "end_time": 22.0, "voice": "Roger Bingham" @@ -161,8 +166,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 22.0, "end_time": 24.0, "voice": "Roger Bingham" @@ -179,8 +185,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 24.0, "end_time": 26.0, "voice": "Roger Bingham" @@ -197,8 +204,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 27.0, "end_time": 30.0, "voice": "Roger Bingham" @@ -215,8 +223,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 30.0, "end_time": 31.5, "voice": "Roger Bingham" @@ -233,8 +242,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 30.5, 
"end_time": 32.5, "voice": "Neil deGrasse Tyson" @@ -251,8 +261,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 32.0, "end_time": 35.5, "voice": "Roger Bingham" @@ -269,8 +280,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 32.5, "end_time": 33.5, "voice": "Neil deGrasse Tyson" @@ -294,8 +306,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 35.5, "end_time": 38.0, "voice": "Roger Bingham" diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json index 2966a2e0..35c53692 100644 --- a/test/data/doc/webvtt_example_02.json +++ b/test/data/doc/webvtt_example_02.json @@ -88,8 +88,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 0.0, "end_time": 2.0, "voice": "Esme", @@ -109,8 +110,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 2.0, "end_time": 4.0, "voice": "Mary" @@ -127,8 +129,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 4.0, "end_time": 6.0, "voice": "Esme" @@ -145,8 +148,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 4.0, "end_time": 6.0 } @@ -162,8 +166,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 4.0, "end_time": 6.0 } @@ -186,8 +191,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 6.0, "end_time": 8.0, "voice": "Mary", @@ -207,8 +213,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 
8.0, "end_time": 10.0 } @@ -224,8 +231,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 8.0, "end_time": 10.0, "languages": [ @@ -254,8 +262,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 8.0, "end_time": 10.0 } diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json index dddce0f2..42d9e5b2 100644 --- a/test/data/doc/webvtt_example_03.json +++ b/test/data/doc/webvtt_example_03.json @@ -83,8 +83,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 4.963, "end_time": 8.571, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", @@ -102,8 +103,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 4.963, "end_time": 8.571, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", @@ -121,8 +123,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 8.571, "end_time": 9.403, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", @@ -140,8 +143,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 10.683, "end_time": 11.563, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" @@ -158,8 +162,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 13.363, "end_time": 13.803, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", @@ -177,8 +182,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 49.603, "end_time": 53.363, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", @@ -196,8 +202,9 @@ "children": [], "content_layer": "body", "label": "text", - 
"prov": [ + "source": [ { + "kind": "track", "start_time": 54.963, "end_time": 62.072, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", @@ -215,8 +222,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 62.072, "end_time": 66.811, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", @@ -234,8 +242,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 62.072, "end_time": 66.811, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", @@ -253,8 +262,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 70.243, "end_time": 73.014, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", @@ -272,8 +282,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 70.243, "end_time": 73.014, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", @@ -291,8 +302,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 70.563, "end_time": 72.643, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", @@ -310,8 +322,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 73.014, "end_time": 75.907, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", @@ -329,8 +342,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 73.014, "end_time": 75.907, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", @@ -348,8 +362,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 110.222, "end_time": 111.643, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", @@ -367,8 +382,9 @@ "children": 
[], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 112.043, "end_time": 115.043, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", @@ -386,8 +402,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 114.603, "end_time": 115.283, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json index f96765fc..7e12385d 100644 --- a/test/data/doc/webvtt_example_04.json +++ b/test/data/doc/webvtt_example_04.json @@ -80,8 +80,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 1.0, "end_time": 4.0 } @@ -97,8 +98,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 5.0, "end_time": 9.0 } @@ -114,8 +116,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 5.0, "end_time": 9.0 } @@ -131,8 +134,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 5.0, "end_time": 9.0, "classes": [ @@ -158,8 +162,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 5.0, "end_time": 9.0 } @@ -175,8 +180,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 5.0, "end_time": 9.0, "voice": "John" diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json index 616c94fc..9a53b3b0 100644 --- a/test/data/doc/webvtt_example_05.json +++ b/test/data/doc/webvtt_example_05.json @@ -94,8 +94,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14580.0, "end_time": 
14760.0, "identifier": "agcvs-08234" @@ -112,8 +113,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -130,8 +132,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -155,8 +158,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", @@ -183,8 +187,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -201,8 +206,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -219,8 +225,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", @@ -247,8 +254,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -272,8 +280,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", @@ -300,8 +309,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234" @@ -325,8 +335,9 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ + "source": [ { + "kind": "track", "start_time": 14760.0, "end_time": 
14818.239, "identifier": "agcvs-08234" diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py index 58fb50db..28b41ad6 100644 --- a/test/test_deserializer_idoctags.py +++ b/test/test_deserializer_idoctags.py @@ -1,5 +1,4 @@ from pathlib import Path -from test.test_serialization_doctag import verify import pytest @@ -21,7 +20,8 @@ TableData, ) from docling_core.types.doc.labels import CodeLanguageLabel -from test.test_serialization_idoctag import add_texts_section, add_list_section +from test.test_serialization_doctag import verify +from test.test_serialization_idoctag import add_list_section, add_texts_section DO_PRINT: bool = False diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 2d1ce498..45a9445c 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -1,7 +1,7 @@ import pytest from pydantic import ValidationError -from docling_core.types.doc import ProvenanceTrack +from docling_core.types.doc import TrackProvenance from docling_core.types.legacy_doc.base import Prov, S3Reference @@ -41,9 +41,9 @@ def test_prov(): def test_prov_track(): - """Test the class ProvenanceTrack.""" + """Test the class TrackProvenance.""" - valid_track = ProvenanceTrack( + valid_track = TrackProvenance( start_time=11.0, end_time=12.0, identifier="test", @@ -61,17 +61,17 @@ def test_prov_track(): assert valid_track.classes == ["v.first.loud", "i.foreignphrase"] with pytest.raises(ValidationError, match="end_time"): - ProvenanceTrack(start_time=11.0) + TrackProvenance(start_time=11.0) with pytest.raises(ValidationError, match="should be a valid list"): - ProvenanceTrack( + TrackProvenance( start_time=11.0, end_time=12.0, languages="en", ) with pytest.raises(ValidationError, match="must be greater than start"): - ProvenanceTrack( + TrackProvenance( start_time=11.0, end_time=11.0, ) diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py index 45d0c983..9b378b03 100644 --- a/test/test_serialization_doctag.py 
+++ b/test/test_serialization_doctag.py @@ -6,8 +6,7 @@ DocTagsDocSerializer, DocTagsParams, ) -from docling_core.types.doc import DoclingDocument -from docling_core.types.doc.document import DoclingDocument, TableData +from docling_core.types.doc import DoclingDocument, TableData from docling_core.types.doc.labels import DocItemLabel from .test_serialization import verify diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py index 43aaa79e..1c0f8479 100644 --- a/test/test_serialization_idoctag.py +++ b/test/test_serialization_idoctag.py @@ -2,37 +2,39 @@ from pathlib import Path from typing import Optional -from test.test_serialization import verify import pytest from docling_core.experimental.idoctags import ( ContentType, - WrapMode, EscapeMode, IDocTagsDocSerializer, IDocTagsParams, IDocTagsSerializationMode, IDocTagsVocabulary, + WrapMode, ) from docling_core.types.doc import ( + BoundingBox, + CodeLanguageLabel, + CoordOrigin, + DescriptionMetaField, DocItemLabel, DoclingDocument, Formatting, - Script, - TableData, -) -from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size -from docling_core.types.doc.document import ( - DescriptionMetaField, + PictureClassificationLabel, PictureClassificationMetaField, PictureClassificationPrediction, PictureMeta, ProvenanceItem, + Script, + Size, SummaryMetaField, + TableData, TabularChartMetaField, ) -from docling_core.types.doc.labels import CodeLanguageLabel, PictureClassificationLabel +from test.test_serialization import verify + def add_texts_section(doc: DoclingDocument): doc.add_text(label=DocItemLabel.TEXT, text="Simple text") @@ -427,7 +429,7 @@ def test_content_allow_all_types(sample_doc: DoclingDocument): serializer = IDocTagsDocSerializer( doc=doc, params=IDocTagsParams( - content_types={ct for ct in ContentType}, + content_types=set(ContentType), ), ) ser_txt = serializer.serialize().text diff --git a/test/test_webvtt.py b/test/test_webvtt.py index 
938da37c..5b1693e3 100644 --- a/test/test_webvtt.py +++ b/test/test_webvtt.py @@ -1,6 +1,5 @@ """Test the data model for WebVTT files. -Assisted by watsonx Code Assistant. Examples extracted from https://www.w3.org/TR/webvtt1/ Copyright © 2019 World Wide Web Consortium. """ From 00a355d81f8b26f0ee9ea2cbce0831c72709f062 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 23 Jan 2026 16:28:56 +0100 Subject: [PATCH 18/20] tests(webvtt): fix test with STYLE and NOTE blocks Signed-off-by: Cesar Berrospi Ramis --- test/test_serialization_doctag.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py index 9b378b03..86237a9a 100644 --- a/test/test_serialization_doctag.py +++ b/test/test_serialization_doctag.py @@ -6,8 +6,7 @@ DocTagsDocSerializer, DocTagsParams, ) -from docling_core.types.doc import DoclingDocument, TableData -from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData from .test_serialization import verify From 818fc626cb5cab7a7ce6c2bd29125c472702a362 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 23 Jan 2026 19:04:04 +0100 Subject: [PATCH 19/20] style(webvtt): apply X | Y annotation instead of Optional, Union Signed-off-by: Cesar Berrospi Ramis --- docling_core/transforms/serializer/webvtt.py | 26 +++++++++--------- docling_core/types/doc/webvtt.py | 28 +++++++++----------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py index bfd1fd55..bbc6e344 100644 --- a/docling_core/transforms/serializer/webvtt.py +++ b/docling_core/transforms/serializer/webvtt.py @@ -2,7 +2,7 @@ import logging import re -from typing import Any, Optional, get_args +from typing import Any, get_args from pydantic import BaseModel from typing_extensions import override @@ -132,7 +132,7 @@ def 
serialize( doc_serializer: BaseDocSerializer, doc: DoclingDocument, is_inline_scope: bool = False, - visited: Optional[set[str]] = None, + visited: set[str] | None = None, **kwargs: Any, ) -> SerializationResult: """Serializes the passed item.""" @@ -158,7 +158,7 @@ def serialize( ) if is_inline_scope: # Iteratively remove unnecessary consecutive tag pairs until no more changes - prev_text: Optional[str] = None + prev_text: str | None = None while prev_text != text: prev_text = text text = _remove_consecutive_pairs(text) @@ -275,7 +275,7 @@ def serialize( doc_serializer: "BaseDocSerializer", doc: DoclingDocument, list_level: int = 0, - visited: Optional[set[str]] = None, + visited: set[str] | None = None, **kwargs: Any, ) -> SerializationResult: """Serializes an inline group to WebVTT format.""" @@ -343,7 +343,7 @@ class WebVTTDocSerializer(DocSerializer): fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer() list_serializer: BaseListSerializer = _WebVTTListSerializer() inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer() - meta_serializer: Optional[BaseMetaSerializer] = _WebVTTMetaSerializer() + meta_serializer: BaseMetaSerializer | None = _WebVTTMetaSerializer() annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer() params: CommonParams = CommonParams() @@ -393,7 +393,7 @@ def serialize_cue_span( self, text: str, tag: START_TAG_NAMES, - anno: Optional[str] = None, + anno: str | None = None, css: list[str] = [], ) -> str: """Apply serialization to a WebVTT cue span.""" @@ -442,10 +442,10 @@ def serialize_doc( **kwargs: Any, ) -> SerializationResult: """Serialize a document out of its parts.""" - title: Optional[str] = None + title: str | None = None - timings: Optional[WebVTTCueTimings] = None - id: Optional[str] = None + timings: WebVTTCueTimings | None = None + id: str | None = None text: str = "" cue_blocks: list[WebVTTCueBlock] = [] for part in parts: @@ -503,10 +503,10 @@ def serialize_doc( def 
post_process( self, text: str, - formatting: Optional[Formatting] = None, - voice: Optional[str] = None, - languages: Optional[list[str]] = None, - classes: Optional[list[str]] = None, + formatting: Formatting | None = None, + voice: str | None = None, + languages: list[str] | None = None, + classes: list[str] | None = None, **kwargs: Any, ) -> str: """Apply some text post-processing steps by adding formatting tags. diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index 297e97fb..f7c4eea6 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -5,7 +5,7 @@ from collections.abc import Iterator from enum import Enum from functools import total_ordering -from typing import Annotated, ClassVar, Literal, Optional, Union +from typing import Annotated, ClassVar, Literal from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic.types import StringConstraints @@ -204,7 +204,7 @@ class WebVTTCueComponentWithTerminator(BaseModel): """WebVTT caption or subtitle cue component optionally with a line terminator.""" component: "WebVTTCueComponent" - terminator: Optional[WebVTTLineTerminator] = None + terminator: WebVTTLineTerminator | None = None @override def __str__(self) -> str: @@ -215,7 +215,7 @@ def __str__(self) -> str: class WebVTTCueInternalText(BaseModel): """WebVTT cue internal text.""" - terminator: Optional[WebVTTLineTerminator] = None + terminator: WebVTTLineTerminator | None = None components: Annotated[ list[WebVTTCueComponentWithTerminator], Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")), @@ -380,15 +380,13 @@ class WebVTTCueLanguageSpan(WebVTTCueComponentBase): WebVTTCueComponent = Annotated[ - Union[ - WebVTTCueTextSpan, - WebVTTCueClassSpan, - WebVTTCueItalicSpan, - WebVTTCueBoldSpan, - WebVTTCueUnderlineSpan, - WebVTTCueVoiceSpan, - WebVTTCueLanguageSpan, - ], + WebVTTCueTextSpan + | WebVTTCueClassSpan + | 
WebVTTCueItalicSpan + | WebVTTCueBoldSpan + | WebVTTCueUnderlineSpan + | WebVTTCueVoiceSpan + | WebVTTCueLanguageSpan, Field( discriminator="kind", description="The type of WebVTT caption or subtitle cue component.", @@ -406,7 +404,7 @@ class WebVTTCueBlock(BaseModel): model_config = ConfigDict(regex_engine="python-re") - identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None + identifier: Annotated[WebVTTCueIdentifier | None, Field(description="The WebVTT cue identifier")] = None timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")] payload: Annotated[ list[WebVTTCueComponentWithTerminator], @@ -456,7 +454,7 @@ def parse(cls, raw: str) -> Self: lines = raw.strip().splitlines() if not lines: raise ValueError("Cue block must have at least one line") - identifier: Optional[WebVTTCueIdentifier] = None + identifier: WebVTTCueIdentifier | None = None timing_line = lines[0] if "-->" not in timing_line and len(lines) > 1: identifier = timing_line @@ -585,7 +583,7 @@ class WebVTTFile(BaseModel): _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)") cue_blocks: list[WebVTTCueBlock] - title: Optional[str] = None + title: str | None = None @staticmethod def verify_signature(content: str) -> bool: From 55fb835f06d0e3578c7527c825e6adb57cbeb37c Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Mon, 26 Jan 2026 00:10:56 +0100 Subject: [PATCH 20/20] refactor(webvtt): simplify TrackProvenance model with tags Signed-off-by: Cesar Berrospi Ramis --- docling_core/transforms/serializer/webvtt.py | 47 +++++--- docling_core/types/doc/document.py | 39 +++--- docling_core/types/doc/webvtt.py | 12 +- docs/DoclingDocument.json | 118 ++++++++++++++----- test/data/doc/webvtt_example_01.json | 91 ++++++++++++-- test/data/doc/webvtt_example_02.json | 53 +++++++-- test/data/doc/webvtt_example_03.json | 112 +++++++++++++++--- test/data/doc/webvtt_example_04.json | 16 
++- test/data/doc/webvtt_example_05.json | 23 +++- test/test_doc_base.py | 32 +++-- 10 files changed, 407 insertions(+), 136 deletions(-) diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py index bbc6e344..eba06b36 100644 --- a/docling_core/transforms/serializer/webvtt.py +++ b/docling_core/transforms/serializer/webvtt.py @@ -152,9 +152,7 @@ def serialize( text: str = doc_serializer.post_process( text=item.text, formatting=item.formatting, - voice=prov.voice, - languages=prov.languages, - classes=prov.classes, + tags=prov.tags, ) if is_inline_scope: # Iteratively remove unnecessary consecutive tag pairs until no more changes @@ -394,7 +392,7 @@ def serialize_cue_span( text: str, tag: START_TAG_NAMES, anno: str | None = None, - css: list[str] = [], + css: list[str] | None = None, ) -> str: """Apply serialization to a WebVTT cue span.""" start_tag: WebVTTCueSpanStartTag @@ -504,9 +502,7 @@ def post_process( self, text: str, formatting: Formatting | None = None, - voice: str | None = None, - languages: list[str] | None = None, - classes: list[str] | None = None, + tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None, **kwargs: Any, ) -> str: """Apply some text post-processing steps by adding formatting tags. @@ -521,25 +517,40 @@ def post_process( 6. 
voice () """ res: str = text - cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {} - - for lang in languages or []: - res = self.serialize_cue_span(text=res, tag="lang", anno=lang, css=cls.get("lang", [])) - - res = super().post_process(text=res, formatting=formatting, classes=cls) - - if "c" in cls: + # cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {} + + languages: list[WebVTTCueSpanStartTagAnnotated] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang" + ] + for lang in languages: + res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes) + + format_classes = { + item.name: item.classes + for item in tags or [] + if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"} + } + res = super().post_process(text=res, formatting=formatting, classes=format_classes) + + class_tag: list[WebVTTCueSpanStartTag] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c" + ] + if class_tag: res = self.serialize_cue_span( text=res, tag="c", - css=cls.get("c", []), + css=class_tag[0].classes, ) + + voice: list[WebVTTCueSpanStartTagAnnotated] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v" + ] if voice: res = self.serialize_cue_span( text=res, tag="v", - anno=voice, - css=cls.get("v", []), + anno=voice[0].annotation, + css=voice[0].classes, ) return res diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index e864574f..a9dd4aa8 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -65,6 +65,7 @@ ) from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import parse_otsl_table_content, relative_path +from docling_core.types.doc.webvtt import WebVTTCueIdentifier, WebVTTCueSpanStartTag, 
WebVTTCueSpanStartTagAnnotated _logger = logging.getLogger(__name__) @@ -1187,8 +1188,8 @@ class TrackProvenance(BaseProvenance): block, an audio clip, or a timed marker in a screen-recording. """ + model_config = ConfigDict(regex_engine="python-re") kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track" - start_time: Annotated[ float, Field( @@ -1203,27 +1204,19 @@ class TrackProvenance(BaseProvenance): description="End time offset of the track cue in seconds", ), ] - identifier: Optional[str] = Field( - None, - examples=["test", "123", "b72d946"], - description="An identifier of the cue", - ) - voice: Optional[str] = Field( - None, - examples=["Mary", "Fred", "Name Surname"], - description="The cue voice (speaker)", - ) - languages: Optional[list[str]] = Field( - None, - examples=[["en", "en-GB"], ["fr-CA"]], - description="Languages of the cue in BCP 47 language tag format", - ) - classes: Optional[list[str]] = Field( - None, - min_length=1, - examples=["b.first", "v.loud", "c.yellow"], - description="Classes for describing the cue significance", - ) + identifier: Annotated[ + WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"]) + ] = None + tags: Annotated[ + list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None, + Field( + description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).", + examples=[ + [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")], + [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])], + ], + ), + ] = None @model_validator(mode="after") def check_order(self) -> Self: @@ -1406,7 +1399,7 @@ class PictureMeta(FloatingMeta): tabular_chart: Optional[TabularChartMetaField] = None -class NodeItem(BaseModel): +class NodeItem(BaseModel, validate_assignment=True): """NodeItem.""" self_ref: str = Field(pattern=_JSON_POINTER_REGEX) diff --git 
a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py index f7c4eea6..32bfc12d 100644 --- a/docling_core/types/doc/webvtt.py +++ b/docling_core/types/doc/webvtt.py @@ -233,19 +233,19 @@ class WebVTTCueSpanStartTag(BaseModel): name: Annotated[START_TAG_NAMES, Field(description="The tag name")] classes: Annotated[ - list[str], + list[str] | None, Field(description="List of classes representing the cue span's significance"), - ] = [] + ] = None @field_validator("classes", mode="after") @classmethod - def validate_classes(cls, value: list[str]) -> list[str]: + def validate_classes(cls, value: list[str] | None) -> list[str] | None: """Validate cue span start tag classes.""" - for item in value: + for item in value or []: if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): raise ValueError("A cue span start tag class contains invalid characters") if not item: - raise ValueError("Cue span start tag classes cannot be empty") + raise ValueError("A cue span start tag class cannot be empty") return value def _get_name_with_classes(self) -> str: @@ -501,7 +501,7 @@ def parse(cls, raw: str) -> Self: raise ValueError(f"Incorrect end tag: {ct}") class_string = closed["class"] annotation = closed["annotation"] - classes: list[str] = [] + classes: list[str] | None = None if class_string: classes = [c for c in class_string.split(".") if c] st: WebVTTCueSpanStartTag diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index b37260eb..6b617f28 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -3224,6 +3224,7 @@ "identifier": { "anyOf": [ { + "pattern": "^(?!.*-->)[^\\n\\r]+$", "type": "string" }, { @@ -3239,25 +3240,73 @@ ], "title": "Identifier" }, - "voice": { + "tags": { "anyOf": [ { - "type": "string" + "items": { + "anyOf": [ + { + "$ref": "#/$defs/WebVTTCueSpanStartTag" + }, + { + "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated" + } + ] + }, + "type": "array" }, { "type": "null" } ], "default": null, - 
"description": "The cue voice (speaker)", + "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).", "examples": [ - "Mary", - "Fred", - "Name Surname" + [ + { + "annotation": "John", + "classes": [ + "loud" + ], + "name": "v" + } + ], + [ + { + "classes": [ + "foreignphrase" + ], + "name": "i" + } + ] + ], + "title": "Tags" + } + }, + "required": [ + "start_time", + "end_time" + ], + "title": "TrackProvenance", + "type": "object" + }, + "WebVTTCueSpanStartTag": { + "description": "WebVTT cue span start tag.", + "properties": { + "name": { + "description": "The tag name", + "enum": [ + "c", + "b", + "i", + "u", + "v", + "lang" ], - "title": "Voice" + "title": "Name", + "type": "string" }, - "languages": { + "classes": { "anyOf": [ { "items": { @@ -3270,17 +3319,31 @@ } ], "default": null, - "description": "Languages of the cue in BCP 47 language tag format", - "examples": [ - [ - "en", - "en-GB" - ], - [ - "fr-CA" - ] + "description": "List of classes representing the cue span's significance", + "title": "Classes" + } + }, + "required": [ + "name" + ], + "title": "WebVTTCueSpanStartTag", + "type": "object" + }, + "WebVTTCueSpanStartTagAnnotated": { + "description": "WebVTT cue span start tag requiring an annotation.", + "properties": { + "name": { + "description": "The tag name", + "enum": [ + "c", + "b", + "i", + "u", + "v", + "lang" ], - "title": "Languages" + "title": "Name", + "type": "string" }, "classes": { "anyOf": [ @@ -3288,7 +3351,6 @@ "items": { "type": "string" }, - "minItems": 1, "type": "array" }, { @@ -3296,20 +3358,20 @@ } ], "default": null, - "description": "Classes for describing the cue significance", - "examples": [ - "b.first", - "v.loud", - "c.yellow" - ], + "description": "List of classes representing the cue span's significance", "title": "Classes" + }, + "annotation": { + "description": "Cue span start tag annotation", + "title": "Annotation", + "type": "string" } }, "required": [ - 
"start_time", - "end_time" + "name", + "annotation" ], - "title": "TrackProvenance", + "title": "WebVTTCueSpanStartTagAnnotated", "type": "object" } }, diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json index 78ce13b6..85d119be 100644 --- a/test/data/doc/webvtt_example_01.json +++ b/test/data/doc/webvtt_example_01.json @@ -76,7 +76,12 @@ "kind": "track", "start_time": 11.0, "end_time": 13.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "We are in New York City", @@ -95,7 +100,12 @@ "kind": "track", "start_time": 13.0, "end_time": 16.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "We’re actually at the Lucern Hotel, just down the street", @@ -114,7 +124,12 @@ "kind": "track", "start_time": 16.0, "end_time": 18.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "from the American Museum of Natural History", @@ -133,7 +148,12 @@ "kind": "track", "start_time": 18.0, "end_time": 20.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "And with me is Neil deGrasse Tyson", @@ -152,7 +172,12 @@ "kind": "track", "start_time": 20.0, "end_time": 22.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "Astrophysicist, Director of the Hayden Planetarium", @@ -171,7 +196,12 @@ "kind": "track", "start_time": 22.0, "end_time": 24.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "at the AMNH.", @@ -190,7 +220,12 @@ "kind": "track", "start_time": 24.0, "end_time": 26.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "Thank you for walking down here.", @@ -209,7 +244,12 @@ "kind": "track", "start_time": 
27.0, "end_time": 30.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "And I want to do a follow-up on the last conversation we did.", @@ -228,7 +268,12 @@ "kind": "track", "start_time": 30.0, "end_time": 31.5, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "When we e-mailed—", @@ -247,7 +292,12 @@ "kind": "track", "start_time": 30.5, "end_time": 32.5, - "voice": "Neil deGrasse Tyson" + "tags": [ + { + "name": "v", + "annotation": "Neil deGrasse Tyson" + } + ] } ], "orig": "Didn’t we talk about enough in that conversation?", @@ -266,7 +316,12 @@ "kind": "track", "start_time": 32.0, "end_time": 35.5, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "No! No no no no; 'cos 'cos obviously 'cos", @@ -285,7 +340,12 @@ "kind": "track", "start_time": 32.5, "end_time": 33.5, - "voice": "Neil deGrasse Tyson" + "tags": [ + { + "name": "v", + "annotation": "Neil deGrasse Tyson" + } + ] } ], "orig": "Laughs", @@ -311,7 +371,12 @@ "kind": "track", "start_time": 35.5, "end_time": 38.0, - "voice": "Roger Bingham" + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] } ], "orig": "You know I’m so excited my glasses are falling off here.", diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json index 35c53692..55fd15ea 100644 --- a/test/data/doc/webvtt_example_02.json +++ b/test/data/doc/webvtt_example_02.json @@ -93,9 +93,15 @@ "kind": "track", "start_time": 0.0, "end_time": 2.0, - "voice": "Esme", - "classes": [ - "v.first.loud" + "tags": [ + { + "name": "v", + "annotation": "Esme", + "classes": [ + "first", + "loud" + ] + } ] } ], @@ -115,7 +121,12 @@ "kind": "track", "start_time": 2.0, "end_time": 4.0, - "voice": "Mary" + "tags": [ + { + "name": "v", + "annotation": "Mary" + } + ] } ], "orig": "No way!", @@ -134,7 +145,12 @@ "kind": "track", 
"start_time": 4.0, "end_time": 6.0, - "voice": "Esme" + "tags": [ + { + "name": "v", + "annotation": "Esme" + } + ] } ], "orig": "Hee!", @@ -196,9 +212,14 @@ "kind": "track", "start_time": 6.0, "end_time": 8.0, - "voice": "Mary", - "classes": [ - "v.loud" + "tags": [ + { + "name": "v", + "annotation": "Mary", + "classes": [ + "loud" + ] + } ] } ], @@ -236,11 +257,17 @@ "kind": "track", "start_time": 8.0, "end_time": 10.0, - "languages": [ - "en" - ], - "classes": [ - "i.foreignphrase" + "tags": [ + { + "name": "lang", + "annotation": "en" + }, + { + "name": "i", + "classes": [ + "foreignphrase" + ] + } ] } ], diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json index 42d9e5b2..7b6faa6c 100644 --- a/test/data/doc/webvtt_example_03.json +++ b/test/data/doc/webvtt_example_03.json @@ -89,7 +89,12 @@ "start_time": 4.963, "end_time": 8.571, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "OK,", @@ -109,7 +114,12 @@ "start_time": 4.963, "end_time": 8.571, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "I think now we should be recording", @@ -129,7 +139,12 @@ "start_time": 8.571, "end_time": 9.403, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "properly.", @@ -168,7 +183,12 @@ "start_time": 13.363, "end_time": 13.803, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "Yeah.", @@ -188,7 +208,12 @@ "start_time": 49.603, "end_time": 53.363, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], 
"orig": "I was also thinking.", @@ -208,7 +233,12 @@ "start_time": 54.963, "end_time": 62.072, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], "orig": "Would be maybe good to create items,", @@ -228,7 +258,12 @@ "start_time": 62.072, "end_time": 66.811, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], "orig": "some metadata,", @@ -248,7 +283,12 @@ "start_time": 62.072, "end_time": 66.811, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], "orig": "some options that can be specific.", @@ -268,7 +308,12 @@ "start_time": 70.243, "end_time": 73.014, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "Yeah,", @@ -288,7 +333,12 @@ "start_time": 70.243, "end_time": 73.014, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "I mean I think you went even more than", @@ -308,7 +358,12 @@ "start_time": 70.563, "end_time": 72.643, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], "orig": "But we preserved the atoms.", @@ -328,7 +383,12 @@ "start_time": 73.014, "end_time": 75.907, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "than me.", @@ -348,7 +408,12 @@ "start_time": 73.014, "end_time": 75.907, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": 
"Speaker A" + } + ] } ], "orig": "I just opened the format.", @@ -368,7 +433,12 @@ "start_time": 110.222, "end_time": 111.643, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "give it a try, yeah.", @@ -388,7 +458,12 @@ "start_time": 112.043, "end_time": 115.043, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", - "voice": "Speaker B" + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] } ], "orig": "Okay, talk to you later.", @@ -408,7 +483,12 @@ "start_time": 114.603, "end_time": 115.283, "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", - "voice": "Speaker A" + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] } ], "orig": "See you.", diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json index 7e12385d..98e7da21 100644 --- a/test/data/doc/webvtt_example_04.json +++ b/test/data/doc/webvtt_example_04.json @@ -139,8 +139,13 @@ "kind": "track", "start_time": 5.0, "end_time": 9.0, - "classes": [ - "b.loud" + "tags": [ + { + "name": "b", + "classes": [ + "loud" + ] + } ] } ], @@ -185,7 +190,12 @@ "kind": "track", "start_time": 5.0, "end_time": 9.0, - "voice": "John" + "tags": [ + { + "name": "v", + "annotation": "John" + } + ] } ], "orig": "This is true.", diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json index 9a53b3b0..4af18174 100644 --- a/test/data/doc/webvtt_example_05.json +++ b/test/data/doc/webvtt_example_05.json @@ -164,8 +164,11 @@ "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", - "languages": [ - "es-ES" + "tags": [ + { + "name": "lang", + "annotation": "es-ES" + } ] } ], @@ -231,8 +234,13 @@ "start_time": 14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", - "classes": [ - "b.loud" + "tags": [ + { + "name": "b", + "classes": [ + "loud" + ] + } ] } ], @@ -286,8 +294,11 @@ "start_time": 
14760.0, "end_time": 14818.239, "identifier": "agcvs-08234", - "languages": [ - "it" + "tags": [ + { + "name": "lang", + "annotation": "it" + } ] } ], diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 45a9445c..5d569716 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -1,7 +1,7 @@ import pytest from pydantic import ValidationError -from docling_core.types.doc import TrackProvenance +from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance from docling_core.types.legacy_doc.base import Prov, S3Reference @@ -40,34 +40,40 @@ def test_prov(): Prov(**prov) -def test_prov_track(): +def test_track_provenance(): """Test the class TrackProvenance.""" valid_track = TrackProvenance( start_time=11.0, end_time=12.0, identifier="test", - voice="Mary", - languages=["en", "en-GB"], - classes=["v.first.loud", "i.foreignphrase"], + tags = [ + {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]}, + {"name": "lang", "annotation": "en"}, + {"name": "lang", "annotation": "en-GB"}, + {"name": "i", "classes": ["foreignphrase"]}, + ] ) assert valid_track assert valid_track.start_time == 11.0 assert valid_track.end_time == 12.0 assert valid_track.identifier == "test" - assert valid_track.voice == "Mary" - assert valid_track.languages == ["en", "en-GB"] - assert valid_track.classes == ["v.first.loud", "i.foreignphrase"] + assert valid_track.tags + assert valid_track.tags[0].annotation == "Mary" + assert valid_track.tags[0].classes == ["first", "loud"] + assert valid_track.tags[1].annotation == "en" + assert valid_track.tags[2].annotation == "en-GB" + assert valid_track.tags[3].classes == ["foreignphrase"] with pytest.raises(ValidationError, match="end_time"): TrackProvenance(start_time=11.0) - with pytest.raises(ValidationError, match="should be a valid list"): + with pytest.raises(ValidationError, match="should be a valid dictionary"): TrackProvenance( start_time=11.0, end_time=12.0, - languages="en", + tags=["en"], ) 
with pytest.raises(ValidationError, match="must be greater than start"): @@ -75,3 +81,9 @@ def test_prov_track(): start_time=11.0, end_time=11.0, ) + + doc = DoclingDocument(name="Unknown") + item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT) + item.source = [valid_track] + with pytest.raises(ValidationError, match="should be a valid list"): + item.source = "Invalid source"