From db920efb87f543265f840287a899241f0439567d Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 14 Nov 2025 11:41:09 +0100
Subject: [PATCH 01/20] refactor: move WebVTT data model from docling

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 416 +++++++++++++++++++++++++
 test/data/webvtt/webvtt_example_01.vtt |  42 +++
 test/data/webvtt/webvtt_example_02.vtt |  15 +
 test/data/webvtt/webvtt_example_03.vtt |  57 ++++
 test/test_webvtt.py                    | 199 ++++++++++++
 5 files changed, 729 insertions(+)
 create mode 100644 docling_core/types/doc/webvtt.py
 create mode 100644 test/data/webvtt/webvtt_example_01.vtt
 create mode 100644 test/data/webvtt/webvtt_example_02.vtt
 create mode 100644 test/data/webvtt/webvtt_example_03.vtt
 create mode 100644 test/test_webvtt.py

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
new file mode 100644
index 00000000..eccae4a6
--- /dev/null
+++ b/docling_core/types/doc/webvtt.py
@@ -0,0 +1,416 @@
+"""Models for Docling's adoption of the Web Video Text Tracks (WebVTT) format."""
+
+import logging
+import re
+from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic.types import StringConstraints
+from typing_extensions import Self, override
+
+_log = logging.getLogger(__name__)
+
+
+class _WebVTTTimestamp(BaseModel):
+    """Model representing a WebVTT timestamp.
+
+    A WebVTT timestamp is always interpreted relative to the current playback position
+    of the media data that the WebVTT file is to be synchronized with.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    raw: Annotated[
+        str,
+        Field(
+            description="A representation of the WebVTT Timestamp as a single string"
+        ),
+    ]
+
+    _pattern: ClassVar[re.Pattern] = re.compile(
+        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
+    )
+    _hours: int
+    _minutes: int
+    _seconds: int
+    _millis: int
+
+    @model_validator(mode="after")
+    def validate_raw(self) -> Self:
+        m = self._pattern.match(self.raw)
+        if not m:
+            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
+        self._hours = int(m.group(1)) if m.group(1) else 0
+        self._minutes = int(m.group(2))
+        self._seconds = int(m.group(3))
+        self._millis = int(m.group(4))
+
+        if self._minutes < 0 or self._minutes > 59:
+            raise ValueError("Minutes must be between 0 and 59")
+        if self._seconds < 0 or self._seconds > 59:
+            raise ValueError("Seconds must be between 0 and 59")
+
+        return self
+
+    @property
+    def seconds(self) -> float:
+        """A representation of the WebVTT Timestamp in seconds."""
+        return (
+            self._hours * 3600
+            + self._minutes * 60
+            + self._seconds
+            + self._millis / 1000.0
+        )
+
+    @override
+    def __str__(self) -> str:
+        return self.raw
+
+
+_WebVTTCueIdentifier = Annotated[
+    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
+class _WebVTTCueTimings(BaseModel):
+    """Model representating WebVTT cue timings."""
+
+    start: Annotated[
+        _WebVTTTimestamp, Field(description="Start time offset of the cue")
+    ]
+    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        if self.start and self.end:
+            if self.end.seconds <= self.start.seconds:
+                raise ValueError("End timestamp must be greater than start timestamp")
+        return self
+
+    @override
+    def __str__(self):
+        return f"{self.start} --> {self.end}"
+
+
+class _WebVTTCueTextSpan(BaseModel):
+    """Model representing a WebVTT cue text span."""
+
+    text: str
+    span_type: Literal["text"] = "text"
+
+    @field_validator("text", mode="after")
+    @classmethod
+    def validate_text(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+            raise ValueError("Cue text span contains invalid characters")
+        if len(value) == 0:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        return self.text
+
+
+class _WebVTTCueVoiceSpan(BaseModel):
+    """Model representing a WebVTT cue voice span."""
+
+    annotation: Annotated[
+        str,
+        Field(
+            description=(
+                "Cue span start tag annotation text representing the name of thevoice"
+            )
+        ),
+    ]
+    classes: Annotated[
+        list[str],
+        Field(description="List of classes representing the cue span's significance"),
+    ] = []
+    components: Annotated[
+        list["_WebVTTCueComponent"],
+        Field(description="The components representing the cue internal text"),
+    ] = []
+    span_type: Literal["v"] = "v"
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def validate_annotation(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
+            raise ValueError(
+                "Cue span start tag annotation contains invalid characters"
+            )
+        if not value:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @field_validator("classes", mode="after")
+    @classmethod
+    def validate_classes(cls, value: list[str]) -> list[str]:
+        for item in value:
+            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
+                raise ValueError(
+                    "A cue span start tag class contains invalid characters"
+                )
+            if not item:
+                raise ValueError("Cue span start tag classes cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
+        inner = "".join(str(span) for span in self.components)
+        return f"<{tag} {self.annotation}>{inner}</v>"
+
+
+class _WebVTTCueClassSpan(BaseModel):
+    span_type: Literal["c"] = "c"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<c>{inner}</c>"
+
+
+class _WebVTTCueItalicSpan(BaseModel):
+    span_type: Literal["i"] = "i"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<i>{inner}</i>"
+
+
+class _WebVTTCueBoldSpan(BaseModel):
+    span_type: Literal["b"] = "b"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<b>{inner}</b>"
+
+
+class _WebVTTCueUnderlineSpan(BaseModel):
+    span_type: Literal["u"] = "u"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<u>{inner}</u>"
+
+
+_WebVTTCueComponent = Annotated[
+    Union[
+        _WebVTTCueTextSpan,
+        _WebVTTCueClassSpan,
+        _WebVTTCueItalicSpan,
+        _WebVTTCueBoldSpan,
+        _WebVTTCueUnderlineSpan,
+        _WebVTTCueVoiceSpan,
+    ],
+    Field(discriminator="span_type", description="The WebVTT cue component"),
+]
+
+
+class _WebVTTCueBlock(BaseModel):
+    """Model representing a WebVTT cue block.
+
+    The optional WebVTT cue settings list is not supported.
+    The cue payload is limited to the following spans: text, class, italic, bold,
+    underline, and voice.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    identifier: Optional[_WebVTTCueIdentifier] = Field(
+        None, description="The WebVTT cue identifier"
+    )
+    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+
+    _pattern_block: ClassVar[re.Pattern] = re.compile(
+        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
+    )
+    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
+        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
+        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
+    )
+
+    @field_validator("payload", mode="after")
+    @classmethod
+    def validate_payload(cls, payload):
+        for voice in payload:
+            if "-->" in str(voice):
+                raise ValueError("Cue payload must not contain '-->'")
+        return payload
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTCueBlock":
+        lines = raw.strip().splitlines()
+        if not lines:
+            raise ValueError("Cue block must have at least one line")
+        identifier: Optional[_WebVTTCueIdentifier] = None
+        timing_line = lines[0]
+        if "-->" not in timing_line and len(lines) > 1:
+            identifier = timing_line
+            timing_line = lines[1]
+            cue_lines = lines[2:]
+        else:
+            cue_lines = lines[1:]
+
+        if "-->" not in timing_line:
+            raise ValueError("Cue block must contain WebVTT cue timings")
+
+        start, end = [t.strip() for t in timing_line.split("-->")]
+        end = re.split(" |\t", end)[0]  # ignore the cue settings list
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(
+            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+        )
+        cue_text = " ".join(cue_lines).strip()
+        if cue_text.startswith("<v") and "</v>" not in cue_text:
+            # adding close tag for cue voice spans without end tag
+            cue_text += "</v>"
+
+        stack: list[list[_WebVTTCueComponent]] = [[]]
+        tag_stack: list[Union[str, tuple]] = []
+
+        pos = 0
+        matches = list(cls._pattern_block.finditer(cue_text))
+        i = 0
+        while i < len(matches):
+            match = matches[i]
+            if match.start() > pos:
+                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+            tag = match.group(0)
+
+            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
+                tag_type = tag[1:2]
+                tag_stack.append(tag_type)
+                stack.append([])
+            elif tag == "</i>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueItalicSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</b>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueBoldSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</u>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</c>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueClassSpan(components=children))
+                tag_stack.pop()
+            elif tag.startswith("<v"):
+                tag_stack.append(("v", tag))
+                stack.append([])
+            elif tag.startswith("</v"):
+                children = stack.pop() if stack else []
+                if (
+                    tag_stack
+                    and isinstance(tag_stack[-1], tuple)
+                    and tag_stack[-1][0] == "v"
+                ):
+                    _, voice = cast(tuple, tag_stack.pop())
+                    voice_match = cls._pattern_voice_tag.match(voice)
+                    if voice_match:
+                        class_string = voice_match.group("class")
+                        annotation = voice_match.group("annotation")
+                        if annotation:
+                            classes: list[str] = []
+                            if class_string:
+                                classes = [c for c in class_string.split(".") if c]
+                            stack[-1].append(
+                                _WebVTTCueVoiceSpan(
+                                    annotation=annotation.strip(),
+                                    classes=classes,
+                                    components=children,
+                                )
+                            )
+
+            pos = match.end()
+            i += 1
+
+        if pos < len(cue_text):
+            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
+
+        return cls(
+            identifier=identifier,
+            timings=timings,
+            payload=stack[0],
+        )
+
+    def __str__(self):
+        parts = []
+        if self.identifier:
+            parts.append(f"{self.identifier}\n")
+        timings_line = str(self.timings)
+        parts.append(timings_line + "\n")
+        for idx, span in enumerate(self.payload):
+            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
+                # the end tag may be omitted for brevity
+                parts.append(str(span).removesuffix("</v>"))
+            else:
+                parts.append(str(span))
+
+        return "".join(parts)
+
+
+class _WebVTTFile(BaseModel):
+    """A model representing a WebVTT file."""
+
+    cue_blocks: list[_WebVTTCueBlock]
+
+    @staticmethod
+    def verify_signature(content: str) -> bool:
+        if not content:
+            return False
+        elif len(content) == 6:
+            return content == "WEBVTT"
+        elif len(content) > 6 and content.startswith("WEBVTT"):
+            return content[6] in (" ", "\t", "\n")
+        else:
+            return False
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTFile":
+        # Normalize newlines to LF
+        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
+
+        # Check WebVTT signature
+        if not cls.verify_signature(raw):
+            raise ValueError("Invalid WebVTT file signature")
+
+        # Strip "WEBVTT" header line
+        lines = raw.split("\n", 1)
+        body = lines[1] if len(lines) > 1 else ""
+
+        # Remove NOTE/STYLE/REGION blocks
+        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
+        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
+
+        # Split into cue blocks
+        raw_blocks = re.split(r"\n\s*\n", body.strip())
+        cues: list[_WebVTTCueBlock] = []
+        for block in raw_blocks:
+            try:
+                cues.append(_WebVTTCueBlock.parse(block))
+            except ValueError as e:
+                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
+
+        return cls(cue_blocks=cues)
+
+    def __iter__(self):
+        return iter(self.cue_blocks)
+
+    def __getitem__(self, idx):
+        return self.cue_blocks[idx]
+
+    def __len__(self):
+        return len(self.cue_blocks)
diff --git a/test/data/webvtt/webvtt_example_01.vtt b/test/data/webvtt/webvtt_example_01.vtt
new file mode 100644
index 00000000..333ca4a8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+<v Roger Bingham>We are in New York City
+
+00:13.000 --> 00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+<v Roger Bingham>from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+<v Roger Bingham>at the AMNH.
+
+00:24.000 --> 00:26.000
+<v Roger Bingham>Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+<v Roger Bingham>When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+<v Neil deGrasse Tyson><i>Laughs</i>
+
+00:35.500 --> 00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.
diff --git a/test/data/webvtt/webvtt_example_02.vtt b/test/data/webvtt/webvtt_example_02.vtt
new file mode 100644
index 00000000..1152a1e8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+<v.first.loud Esme>It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+<v Mary>No way!
+
+00:04.000 --> 00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:06.000 --> 00:08.000
+<v.loud Mary>That’s awesome!
\ No newline at end of file
diff --git a/test/data/webvtt/webvtt_example_03.vtt b/test/data/webvtt/webvtt_example_03.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
new file mode 100644
index 00000000..75f5dfc1
--- /dev/null
+++ b/test/test_webvtt.py
@@ -0,0 +1,199 @@
+# Assisted by watsonx Code Assistant
+
+
+import pytest
+from pydantic import ValidationError
+
+from docling_core.types.doc.webvtt import (
+    _WebVTTCueItalicSpan,
+    _WebVTTCueTextSpan,
+    _WebVTTCueTimings,
+    _WebVTTCueVoiceSpan,
+    _WebVTTFile,
+    _WebVTTTimestamp,
+)
+
+from .test_data_gen_flag import GEN_TEST_DATA
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Test WebVTT components."""
+    valid_timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    valid_total_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for idx, ts in enumerate(valid_timestamps):
+        model = _WebVTTTimestamp(raw=ts)
+        assert model.seconds == valid_total_seconds[idx]
+
+    """Test invalid WebVTT timestamps."""
+    invalid_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for ts in invalid_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=ts)
+
+    """Test the timestamp __str__ method."""
+    model = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(model) == "00:01:02.345"
+
+    """Test valid cue timings."""
+    start = _WebVTTTimestamp(raw="00:10.005")
+    end = _WebVTTTimestamp(raw="00:14.007")
+    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    assert cue_timings.start == start
+    assert cue_timings.end == end
+    assert str(cue_timings) == "00:10.005 --> 00:14.007"
+
+    """Test invalid cue timings with end timestamp before start."""
+    start = _WebVTTTimestamp(raw="00:10.700")
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start, end=end)
+    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
+
+    """Test invalid cue timings with missing end."""
+    start = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start)
+    assert "Field required" in str(excinfo.value)
+
+    """Test invalid cue timings with missing start."""
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(end=end)
+    assert "Field required" in str(excinfo.value)
+
+    """Test with valid text."""
+    valid_text = "This is a valid cue text span."
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+    assert str(span) == valid_text
+
+    """Test with text containing newline characters."""
+    invalid_text = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing ampersand."""
+    invalid_text = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing less-than sign."""
+    invalid_text = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with empty text."""
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text="")
+
+    """Test that annotation validation works correctly."""
+    valid_annotation = "valid-annotation"
+    invalid_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
+    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+
+    """Test that classes validation works correctly."""
+    annotation = "speaker name"
+    valid_classes = ["class1", "class2"]
+    invalid_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+
+    """Test that components validation works correctly."""
+    annotation = "speaker name"
+    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    invalid_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+
+    """Test valid cue voice spans."""
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+
+    expected_str = "<v.loud.clear speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    expected_str = "<v speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+
+def test_webvtt_file():
+    """Test WebVTT files."""
+    with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse
+
+    with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."

From 2e9663e1ef8abdbcb70496b40cafa6218d164d25 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 14 Nov 2025 14:53:05 +0100
Subject: [PATCH 02/20] fix(webvtt): deal with HTML entities in cue text spans

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 15 ++++++++++++++-
 test/test_webvtt.py              |  6 ++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index eccae4a6..d7cabdc3 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -98,10 +98,23 @@ class _WebVTTCueTextSpan(BaseModel):
     text: str
     span_type: Literal["text"] = "text"
 
+    _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+    _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);")
+
     @field_validator("text", mode="after")
     @classmethod
     def validate_text(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+        for match in cls._entity_pattern.finditer(value):
+            entity = match.group(1)
+            if entity not in cls._valid_entities:
+                raise ValueError(
+                    f"Cue text span contains an invalid HTML entity: &{entity};"
+                )
+        if "&" in re.sub(cls._entity_pattern, "", value):
+            raise ValueError(
+                "Found '&' not part of a valid entity in the cue text span"
+            )
+        if any(ch in value for ch in {"\n", "\r", "<"}):
             raise ValueError("Cue text span contains invalid characters")
         if len(value) == 0:
             raise ValueError("Cue text span cannot be empty")
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 75f5dfc1..ea4f2889 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -96,6 +96,12 @@ def test_vtt_cue_commponents():
     invalid_text = "This cue text span contains &."
     with pytest.raises(ValidationError):
         _WebVTTCueTextSpan(text=invalid_text)
+    invalid_text = "An invalid &foo; entity"
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+    valid_text = "My favorite book is Pride &amp; Prejudice"
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
 
     """Test with text containing less-than sign."""
     invalid_text = "This cue text span contains <."

From ea303dbe46f437274d984e4d06769f7734c375cb Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 17 Nov 2025 03:32:05 +0100
Subject: [PATCH 03/20] refactor(webvtt): support more WebVTT models

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 367 +++++++++++++++++++------------
 test/test_webvtt.py              | 137 +++++++++---
 2 files changed, 332 insertions(+), 172 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index d7cabdc3..6d60a2d8 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -2,7 +2,8 @@
 
 import logging
 import re
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+from enum import Enum
+from typing import Annotated, ClassVar, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
@@ -11,8 +12,24 @@
 _log = logging.getLogger(__name__)
 
 
+_VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+_ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
+_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
+
+
+class _WebVTTLineTerminator(str, Enum):
+    CRLF = "\r\n"
+    LF = "\n"
+    CR = "\r"
+
+
+_WebVTTCueIdentifier = Annotated[
+    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
 class _WebVTTTimestamp(BaseModel):
-    """Model representing a WebVTT timestamp.
+    """WebVTT timestamp.
 
     A WebVTT timestamp is always interpreted relative to the current playback position
     of the media data that the WebVTT file is to be synchronized with.
@@ -67,13 +84,8 @@ def __str__(self) -> str:
         return self.raw
 
 
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
 class _WebVTTCueTimings(BaseModel):
-    """Model representating WebVTT cue timings."""
+    """WebVTT cue timings."""
 
     start: Annotated[
         _WebVTTTimestamp, Field(description="Start time offset of the cue")
@@ -93,31 +105,27 @@ def __str__(self):
 
 
 class _WebVTTCueTextSpan(BaseModel):
-    """Model representing a WebVTT cue text span."""
+    """WebVTT cue text span."""
 
-    text: str
-    span_type: Literal["text"] = "text"
-
-    _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
-    _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);")
+    kind: Literal["text"] = "text"
+    text: Annotated[str, Field(description="The cue text.")]
 
     @field_validator("text", mode="after")
     @classmethod
-    def validate_text(cls, value: str) -> str:
-        for match in cls._entity_pattern.finditer(value):
+    def is_valid_text(cls, value: str) -> str:
+        for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
-            if entity not in cls._valid_entities:
+            if entity not in _VALID_ENTITIES:
                 raise ValueError(
-                    f"Cue text span contains an invalid HTML entity: &{entity};"
+                    f"Cue text contains an invalid HTML entity: &{entity};"
                 )
-        if "&" in re.sub(cls._entity_pattern, "", value):
-            raise ValueError(
-                "Found '&' not part of a valid entity in the cue text span"
-            )
+        if "&" in re.sub(_ENTITY_PATTERN, "", value):
+            raise ValueError("Found '&' not part of a valid entity in the cue text")
         if any(ch in value for ch in {"\n", "\r", "<"}):
-            raise ValueError("Cue text span contains invalid characters")
+            raise ValueError("Cue text contains invalid characters")
         if len(value) == 0:
-            raise ValueError("Cue text span cannot be empty")
+            raise ValueError("Cue text cannot be empty")
+
         return value
 
     @override
@@ -125,37 +133,48 @@ def __str__(self):
         return self.text
 
 
-class _WebVTTCueVoiceSpan(BaseModel):
-    """Model representing a WebVTT cue voice span."""
+class _WebVTTCueComponentWithTerminator(BaseModel):
+    """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
-    annotation: Annotated[
-        str,
+    component: "_WebVTTCueComponent"
+    terminator: Optional[_WebVTTLineTerminator] = None
+
+    @override
+    def __str__(self):
+        return f"{self.component}{self.terminator.value if self.terminator else ''}"
+
+
+class _WebVTTCueInternalText(BaseModel):
+    """WebVTT cue internal text."""
+
+    terminator: Optional[_WebVTTLineTerminator] = None
+    components: Annotated[
+        list[_WebVTTCueComponentWithTerminator],
         Field(
             description=(
-                "Cue span start tag annotation text representing the name of thevoice"
+                "WebVTT caption or subtitle cue components representing the "
+                "cue internal text"
             )
         ),
-    ]
+    ] = []
+
+    @override
+    def __str__(self):
+        cue_str = (
+            f"{self.terminator.value if self.terminator else ''}"
+            f"{''.join(str(span) for span in self.components)}"
+        )
+        return cue_str
+
+
+class _WebVTTCueSpanStartTag(BaseModel):
+    """WebVTT cue span start tag."""
+
+    name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
         list[str],
         Field(description="List of classes representing the cue span's significance"),
     ] = []
-    components: Annotated[
-        list["_WebVTTCueComponent"],
-        Field(description="The components representing the cue internal text"),
-    ] = []
-    span_type: Literal["v"] = "v"
-
-    @field_validator("annotation", mode="after")
-    @classmethod
-    def validate_annotation(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
-            raise ValueError(
-                "Cue span start tag annotation contains invalid characters"
-            )
-        if not value:
-            raise ValueError("Cue text span cannot be empty")
-        return value
 
     @field_validator("classes", mode="after")
     @classmethod
@@ -169,51 +188,113 @@ def validate_classes(cls, value: list[str]) -> list[str]:
                 raise ValueError("Cue span start tag classes cannot be empty")
         return value
 
+    def _get_name_with_classes(self) -> str:
+        return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
+
     @override
     def __str__(self):
-        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
-        inner = "".join(str(span) for span in self.components)
-        return f"<{tag} {self.annotation}>{inner}</v>"
+        return f"<{self._get_name_with_classes()}>"
 
 
-class _WebVTTCueClassSpan(BaseModel):
-    span_type: Literal["c"] = "c"
-    components: list["_WebVTTCueComponent"]
+class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
+    """WebVTT cue span start tag requiring an annotation."""
 
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<c>{inner}</c>"
+    annotation: Annotated[str, Field(description="Cue span start tag annotation")]
 
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def is_valid_annotation(cls, value: str) -> str:
+        for match in _ENTITY_PATTERN.finditer(value):
+            entity = match.group(1)
+            if entity not in _VALID_ENTITIES:
+                raise ValueError(
+                    f"Annotation contains an invalid HTML entity: &{entity};"
+                )
+        if "&" in re.sub(_ENTITY_PATTERN, "", value):
+            raise ValueError("Found '&' not part of a valid entity in annotation")
+        if any(ch in value for ch in {"\n", "\r", ">"}):
+            raise ValueError("Annotation contains invalid characters")
+        if len(value) == 0:
+            raise ValueError("Annotation cannot be empty")
 
-class _WebVTTCueItalicSpan(BaseModel):
-    span_type: Literal["i"] = "i"
-    components: list["_WebVTTCueComponent"]
+        return value
 
     @override
     def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<i>{inner}</i>"
+        return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
-class _WebVTTCueBoldSpan(BaseModel):
-    span_type: Literal["b"] = "b"
-    components: list["_WebVTTCueComponent"]
+class _WebVTTCueComponentBase(BaseModel):
+    """WebVTT caption or subtitle cue component.
 
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<b>{inner}</b>"
+    All the WebVTT caption or subtitle cue components are represented by this class
+    except the WebVTT cue text span, which requires different definitions.
+    """
 
+    kind: Literal["c", "b", "i", "u", "v", "lang"]
+    start_tag: _WebVTTCueSpanStartTag
+    internal_text: _WebVTTCueInternalText
 
-class _WebVTTCueUnderlineSpan(BaseModel):
-    span_type: Literal["u"] = "u"
-    components: list["_WebVTTCueComponent"]
+    @model_validator(mode="after")
+    def check_tag_names_match(self) -> Self:
+        if self.kind != self.start_tag.name:
+            raise ValueError(f"The tag name of this cue component should be {self.kind}")
+        return self
 
     @override
     def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<u>{inner}</u>"
+        return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
+
+
+class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase):
+    """WebVTT cue voice span associated with a specific voice."""
+
+    kind: Literal["v"] = "v"
+    start_tag: _WebVTTCueSpanStartTagAnnotated
+
+
+class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
+    """WebVTT cue class span.
+
+    It represents a span of text and it is used to annotate parts of the cue with
+    applicable classes without implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["c"] = "c"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c")
+
+
+class _WebVTTCueItalicSpan(_WebVTTCueComponentBase):
+    """WebVTT cue italic span representing a span of italic text."""
+
+    kind: Literal["i"] = "i"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i")
+
+
+class _WebVTTCueBoldSpan(_WebVTTCueComponentBase):
+    """WebVTT cue bold span representing a span of bold text."""
+
+    kind: Literal["b"] = "b"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b")
+
+
+class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase):
+    """WebVTT cue underline span representing a span of underline text."""
+
+    kind: Literal["u"] = "u"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u")
+
+
+class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
+    """WebVTT cue language span.
+
+    It represents a span of text and it is used to annotate parts of the cue where the
+    applicable language might be different than the surrounding text's, without
+    implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["lang"] = "lang"
+    start_tag: _WebVTTCueSpanStartTagAnnotated
 
 
 _WebVTTCueComponent = Annotated[
@@ -224,8 +305,12 @@ def __str__(self):
         _WebVTTCueBoldSpan,
         _WebVTTCueUnderlineSpan,
         _WebVTTCueVoiceSpan,
+        _WebVTTCueLanguageSpan,
     ],
-    Field(discriminator="span_type", description="The WebVTT cue component"),
+    Field(
+        discriminator="kind",
+        description="The type of WebVTT caption or subtitle cue component.",
+    ),
 ]
 
 
@@ -243,14 +328,17 @@ class _WebVTTCueBlock(BaseModel):
         None, description="The WebVTT cue identifier"
     )
     timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
-    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+    payload: Annotated[
+        list[_WebVTTCueComponentWithTerminator],
+        Field(description="The WebVTT caption or subtitle cue text"),
+    ]
 
-    _pattern_block: ClassVar[re.Pattern] = re.compile(
-        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
-    )
-    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
-        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
-        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
+    # pattern of a WebVTT cue span start/end tag
+    _pattern_tag: ClassVar[re.Pattern] = re.compile(
+        r"<(?P<end>/?)"
+        r"(?P<tag>i|b|c|u|v|lang)"
+        r"(?P<class>(?:\.[^\t\n\r &<>.]+)*)"
+        r"(?:[ \t](?P<annotation>[^\n\r&>]*))?>"
     )
 
     @field_validator("payload", mode="after")
@@ -284,74 +372,77 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
         )
         cue_text = " ".join(cue_lines).strip()
-        if cue_text.startswith("<v") and "</v>" not in cue_text:
-            # adding close tag for cue voice spans without end tag
-            cue_text += "</v>"
+        # adding close tag for cue spans without end tag
+        for omm in {"v"}:
+            if cue_text.startswith(f"<{omm}") and f"</{omm}>" not in cue_text:
+                cue_text += f"</{omm}>"
+                break
 
-        stack: list[list[_WebVTTCueComponent]] = [[]]
-        tag_stack: list[Union[str, tuple]] = []
+        stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]]
+        tag_stack: list[dict] = []
 
         pos = 0
-        matches = list(cls._pattern_block.finditer(cue_text))
+        matches = list(cls._pattern_tag.finditer(cue_text))
         i = 0
         while i < len(matches):
             match = matches[i]
             if match.start() > pos:
-                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-            tag = match.group(0)
-
-            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
-                tag_type = tag[1:2]
-                tag_stack.append(tag_type)
-                stack.append([])
-            elif tag == "</i>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueItalicSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</b>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueBoldSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</u>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</c>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueClassSpan(components=children))
-                tag_stack.pop()
-            elif tag.startswith("<v"):
-                tag_stack.append(("v", tag))
-                stack.append([])
-            elif tag.startswith("</v"):
-                children = stack.pop() if stack else []
-                if (
-                    tag_stack
-                    and isinstance(tag_stack[-1], tuple)
-                    and tag_stack[-1][0] == "v"
-                ):
-                    _, voice = cast(tuple, tag_stack.pop())
-                    voice_match = cls._pattern_voice_tag.match(voice)
-                    if voice_match:
-                        class_string = voice_match.group("class")
-                        annotation = voice_match.group("annotation")
-                        if annotation:
-                            classes: list[str] = []
-                            if class_string:
-                                classes = [c for c in class_string.split(".") if c]
-                            stack[-1].append(
-                                _WebVTTCueVoiceSpan(
-                                    annotation=annotation.strip(),
-                                    classes=classes,
-                                    components=children,
-                                )
+                stack[-1].append(
+                    _WebVTTCueComponentWithTerminator(
+                        component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])
+                    )
+                )
+            gps = {k: (v if v else None) for k, v in match.groupdict().items()}
+
+            if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}:
+                if not gps["end"]:
+                    tag_stack.append(gps)
+                    stack.append([])
+                else:
+                    children = stack.pop() if stack else []
+                    if tag_stack:
+                        closed = tag_stack.pop()
+                        if (ct := closed["tag"]) != gps["tag"]:
+                            raise ValueError(f"Incorrect end tag: {ct}")
+                        class_string = closed["class"]
+                        annotation = closed["annotation"]
+                        classes: list[str] = []
+                        if class_string:
+                            classes = [c for c in class_string.split(".") if c]
+                        st = (
+                            _WebVTTCueSpanStartTagAnnotated(
+                                name=ct, classes=classes, annotation=annotation.strip()
                             )
+                            if annotation
+                            else _WebVTTCueSpanStartTag(name=ct, classes=classes)
+                        )
+                        it = _WebVTTCueInternalText(components=children)
+                        cp: _WebVTTCueComponent
+                        if ct == "c":
+                            cp = _WebVTTCueClassSpan(start_tag=st, internal_text=it)
+                        elif ct == "b":
+                            cp = _WebVTTCueBoldSpan(start_tag=st, internal_text=it)
+                        elif ct == "i":
+                            cp = _WebVTTCueItalicSpan(start_tag=st, internal_text=it)
+                        elif ct == "u":
+                            cp = _WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
+                        elif ct == "lang":
+                            cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
+                        elif ct == "v":
+                            cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
+                        stack[-1].append(
+                            _WebVTTCueComponentWithTerminator(component=cp)
+                        )
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
+            stack[-1].append(
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text=cue_text[pos:])
+                )
+            )
 
         return cls(
             identifier=identifier,
@@ -366,13 +457,13 @@ def __str__(self):
         timings_line = str(self.timings)
         parts.append(timings_line + "\n")
         for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
+            if idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
                 # the end tag may be omitted for brevity
                 parts.append(str(span).removesuffix("</v>"))
             else:
                 parts.append(str(span))
 
-        return "".join(parts)
+        return "".join(parts) + "\n"
 
 
 class _WebVTTFile(BaseModel):
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index ea4f2889..b4d408cb 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -1,11 +1,20 @@
-# Assisted by watsonx Code Assistant
+"""Test the data model for WebVTT files.
 
+Assisted by watsonx Code Assistant.
+Examples extracted from https://www.w3.org/TR/webvtt1/
+Copyright © 2019 World Wide Web Consortium.
+"""
 
 import pytest
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    _WebVTTCueBlock,
+    _WebVTTCueComponentWithTerminator,
+    _WebVTTCueInternalText,
     _WebVTTCueItalicSpan,
+    _WebVTTCueLanguageSpan,
+    _WebVTTCueSpanStartTagAnnotated,
     _WebVTTCueTextSpan,
     _WebVTTCueTimings,
     _WebVTTCueVoiceSpan,
@@ -18,7 +27,7 @@
 GENERATE = GEN_TEST_DATA
 
 
-def test_vtt_cue_commponents():
+def test_vtt_cue_commponents() -> None:
     """Test WebVTT components."""
     valid_timestamps = [
         "00:01:02.345",
@@ -72,13 +81,13 @@ def test_vtt_cue_commponents():
     """Test invalid cue timings with missing end."""
     start = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)
+        _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
     end = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)
+        _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test with valid text."""
@@ -116,44 +125,105 @@ def test_vtt_cue_commponents():
     valid_annotation = "valid-annotation"
     invalid_annotation = "invalid\nannotation"
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
-    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+        _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
+    assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
 
     """Test that classes validation works correctly."""
     annotation = "speaker name"
     valid_classes = ["class1", "class2"]
     invalid_classes = ["class\nwith\nnewlines", ""]
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+        _WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation=annotation, classes=invalid_classes
+        )
+    assert _WebVTTCueSpanStartTagAnnotated(
+        name="v", annotation=annotation, classes=valid_classes
+    )
 
     """Test that components validation works correctly."""
     annotation = "speaker name"
-    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    valid_components = [
+        _WebVTTCueComponentWithTerminator(
+            component=_WebVTTCueTextSpan(text="random text")
+        )
+    ]
     invalid_components = [123, "not a component"]
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+        _WebVTTCueInternalText(components=invalid_components)
+    assert _WebVTTCueInternalText(components=valid_components)
 
     """Test valid cue voice spans."""
     cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        classes=["loud", "clear"],
-        components=[_WebVTTCueTextSpan(text="random text")],
+        start_tag=_WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation="speaker", classes=["loud", "clear"]
+        ),
+        internal_text=_WebVTTCueInternalText(
+            components=[
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
     )
-
     expected_str = "<v.loud.clear speaker>random text</v>"
     assert str(cue_span) == expected_str
 
     cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        components=[_WebVTTCueTextSpan(text="random text")],
+        start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
+        internal_text=_WebVTTCueInternalText(
+            components=[
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
     )
     expected_str = "<v speaker>random text</v>"
     assert str(cue_span) == expected_str
 
 
-def test_webvtt_file():
+def test_webvttcueblock_parse() -> None:
+    """Test the method parse of _WebVTTCueBlock class."""
+    raw: str = (
+        "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
+    )
+    block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:02.500 --> 04:05.000"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert (
+        block.payload[0].component.text
+        == "J’ai commencé le basket à l'âge de 13, 14 ans"
+    )
+    assert raw == str(block)
+
+    raw = (
+        "04:05.001 --> 04:07.800\n"
+        "Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier\n"
+    )
+    block = _WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:05.001 --> 04:07.800"
+    assert len(block.payload) == 3
+    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Sur les "
+    assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan)
+    assert len(block.payload[1].component.internal_text.components) == 1
+    lang_span = block.payload[1].component.internal_text.components[0].component
+    assert isinstance(lang_span, _WebVTTCueLanguageSpan)
+    assert isinstance(
+        lang_span.internal_text.components[0].component, _WebVTTCueTextSpan
+    )
+    assert lang_span.internal_text.components[0].component.text == "playground"
+    assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[2].component, _WebVTTCueTextSpan)
+    assert block.payload[2].component.text == ", ici à Montpellier"
+    assert raw == str(block)
+
+
+def test_webvtt_file() -> None:
     """Test WebVTT files."""
     with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -163,16 +233,16 @@ def test_webvtt_file():
     assert str(block.timings) == "00:32.500 --> 00:33.500"
     assert len(block.payload) == 1
     cue_span = block.payload[0]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert cue_span.annotation == "Neil deGrasse Tyson"
-    assert not cue_span.classes
-    assert len(cue_span.components) == 1
-    comp = cue_span.components[0]
-    assert isinstance(comp, _WebVTTCueItalicSpan)
-    assert len(comp.components) == 1
-    comp2 = comp.components[0]
-    assert isinstance(comp2, _WebVTTCueTextSpan)
-    assert comp2.text == "Laughs"
+    assert isinstance(cue_span.component, _WebVTTCueVoiceSpan)
+    assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.component.start_tag.classes
+    assert len(cue_span.component.internal_text.components) == 1
+    comp = cue_span.component.internal_text.components[0]
+    assert isinstance(comp.component, _WebVTTCueItalicSpan)
+    assert len(comp.component.internal_text.components) == 1
+    comp2 = comp.component.internal_text.components[0]
+    assert isinstance(comp2.component, _WebVTTCueTextSpan)
+    assert comp2.component.text == "Laughs"
 
     with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -182,8 +252,8 @@ def test_webvtt_file():
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
         "https://www.w3.org/TR/webvtt1/\n\n"
     )
-    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
-    assert content == reverse
+    reverse += "\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse.rstrip()
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -195,11 +265,10 @@ def test_webvtt_file():
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
     assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan)
     block = vtt.cue_blocks[2]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
     assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
-    assert block.payload[0].text == "Good."
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Good."

From 0122141cfc50d4e8448a8a2e5f427f440eb06899 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 27 Nov 2025 18:58:35 +0100
Subject: [PATCH 04/20] refactor(DoclingDocument): create a new provenance
 model for media file types

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/azure.py   |  17 +-
 docling_core/transforms/serializer/common.py  |  20 +-
 docling_core/transforms/serializer/doctags.py |  14 +-
 .../visualizer/key_value_visualizer.py        |  13 +-
 .../visualizer/layout_visualizer.py           |  16 +-
 .../visualizer/reading_order_visualizer.py    |   3 +-
 .../transforms/visualizer/table_visualizer.py |  11 +-
 docling_core/types/doc/__init__.py            |   1 +
 docling_core/types/doc/document.py            | 196 ++++++++++-----
 docling_core/types/doc/webvtt.py              |  73 ++----
 docling_core/utils/legacy.py                  |   8 +-
 docs/DoclingDocument.json                     | 229 ++++++++++++++++--
 12 files changed, 439 insertions(+), 162 deletions(-)

diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
index 385aca6a..ed91aee2 100644
--- a/docling_core/transforms/serializer/azure.py
+++ b/docling_core/transforms/serializer/azure.py
@@ -44,9 +44,10 @@
     DocSerializer,
     create_ser_result,
 )
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import (
+from docling_core.types.doc import (
+    CoordOrigin,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FormItem,
     InlineGroup,
@@ -54,12 +55,12 @@
     ListGroup,
     NodeItem,
     PictureItem,
+    ProvenanceItem,
     RefItem,
     RichTableCell,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
 
 
 def _bbox_to_polygon_coords(
@@ -76,7 +77,7 @@ def _bbox_to_polygon_coords(
 
 def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]:
     """Compute a TOPLEFT-origin polygon for the first provenance of the item."""
-    if not item.prov:
+    if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
         return None
 
     prov = item.prov[0]
@@ -187,7 +188,7 @@ def serialize(
 
         # Lists may be represented either as TextItem(ListItem) or via groups;
         # we treat any TextItem as a paragraph-like entry.
-        if item.prov:
+        if item.prov and isinstance(item.prov[0], ProvenanceItem):
             prov = item.prov[0]
             page_no = prov.page_no
             polygon = _bbox_to_polygon_for_item(doc, item)
@@ -237,7 +238,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov:
+        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
             return create_ser_result()
 
         prov = item.prov[0]
@@ -308,7 +309,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov:
+        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
             return create_ser_result()
 
         prov = item.prov[0]
@@ -324,7 +325,7 @@ def serialize(
         for foot_ref in item.footnotes:
             if isinstance(foot_ref, RefItem):
                 tgt = foot_ref.resolve(doc)
-                if isinstance(tgt, TextItem) and tgt.prov:
+                if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem):
                     f_poly = _bbox_to_polygon_for_item(doc, tgt)
                     if f_poly is not None:
                         foots.append(
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 3a8ad71c..c9c497f4 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -35,11 +35,11 @@
     SerializationResult,
     Span,
 )
-from docling_core.types.doc.document import (
-    DOCUMENT_TOKENS_EXPORT_LABELS,
+from docling_core.types.doc import (
     ContentLayer,
     DescriptionAnnotation,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FloatingItem,
     Formatting,
@@ -52,12 +52,13 @@
     PictureDataType,
     PictureItem,
     PictureMoleculeData,
+    ProvenanceItem,
     Script,
     TableAnnotationType,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS
 
 _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
 _DEFAULT_LAYERS = set(ContentLayer)
@@ -108,7 +109,7 @@ def _iterate_items(
                     add_page_breaks=add_page_breaks,
                     visited=my_visited,
                 ):
-                    if isinstance(it, DocItem) and it.prov:
+                    if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem):
                         page_no = it.prov[0].page_no
                         if prev_page_nr is not None and page_no > prev_page_nr:
                             yield (
@@ -120,7 +121,7 @@ def _iterate_items(
                                 lvl,
                             )
                         break
-            elif isinstance(item, DocItem) and item.prov:
+            elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem):
                 page_no = item.prov[0].page_no
                 if prev_page_nr is None or page_no > prev_page_nr:
                     if prev_page_nr is not None:  # close previous range
@@ -301,7 +302,13 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
                             or item.content_layer not in params.layers
                             or (
                                 params.pages is not None
-                                and ((not item.prov) or item.prov[0].page_no not in params.pages)
+                                and (
+                                    (not item.prov)
+                                    or (
+                                        isinstance(item.prov[0], ProvenanceItem)
+                                        and item.prov[0].page_no not in params.pages
+                                    )
+                                )
                             )
                         )
                     )
@@ -671,6 +678,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
             if (
                 isinstance(item, DocItem)
                 and item.prov
+                and isinstance(item.prov[0], ProvenanceItem)
                 and (self.params.pages is None or item.prov[0].page_no in self.params.pages)
                 and ix >= self.params.start_idx
                 and ix < self.params.stop_idx
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index e5672638..16549652 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -26,11 +26,13 @@
     _should_use_legacy_annotations,
     create_ser_result,
 )
-from docling_core.types.doc.base import BoundingBox
 from docling_core.types.doc.document import (
+    BoundingBox,
     CodeItem,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
+    DocumentToken,
     FloatingItem,
     FormItem,
     GroupItem,
@@ -40,6 +42,7 @@
     ListItem,
     NodeItem,
     PictureClassificationData,
+    PictureClassificationLabel,
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
@@ -47,10 +50,9 @@
     SectionHeaderItem,
     TableData,
     TableItem,
+    TableToken,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
-from docling_core.types.doc.tokens import DocumentToken, TableToken
 
 
 def _wrap(text: str, wrap_tag: str) -> str:
@@ -343,7 +345,7 @@ def serialize(
         results: list[SerializationResult] = []
 
         page_no = 1
-        if len(item.prov) > 0:
+        if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
             page_no = item.prov[0].page_no
 
         if params.add_location:
@@ -361,7 +363,7 @@ def serialize(
 
         for cell in item.graph.cells:
             cell_txt = ""
-            if cell.prov is not None:
+            if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
                 if len(doc.pages.keys()):
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
                     cell_txt += DocumentToken.get_location(
@@ -469,7 +471,7 @@ def _get_inline_location_tags(
         doc_items: list[DocItem] = []
         for it, _ in doc.iterate_items(root=item):
             if isinstance(it, DocItem):
-                for prov in it.prov:
+                for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
                     boxes.append(prov.bbox)
                     doc_items.append(it)
         if prov is None:
diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py
index 5ed7b843..e2b10264 100644
--- a/docling_core/transforms/visualizer/key_value_visualizer.py
+++ b/docling_core/transforms/visualizer/key_value_visualizer.py
@@ -16,8 +16,13 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument
-from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
+from docling_core.types.doc import (
+    ContentLayer,
+    DoclingDocument,
+    GraphCellLabel,
+    GraphLinkLabel,
+    ProvenanceItem,
+)
 
 # ---------------------------------------------------------------------------
 # Helper functions / constants
@@ -82,7 +87,7 @@ def _draw_key_value_layer(
             # First draw cells (rectangles + optional labels)
             # ------------------------------------------------------------------
             for cell in cell_dict.values():
-                if cell.prov is None or cell.prov.page_no != page_no:
+                if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no:
                     continue  # skip cells not on this page or without bbox
 
                 tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height)
@@ -149,6 +154,8 @@ def _draw_key_value_layer(
                 if (
                     src_cell.prov is None
                     or tgt_cell.prov is None
+                    or not isinstance(src_cell.prov, ProvenanceItem)
+                    or not isinstance(tgt_cell.prov, ProvenanceItem)
                     or src_cell.prov.page_no != page_no
                     or tgt_cell.prov.page_no != page_no
                 ):
diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py
index 369a7b38..8ac6bf81 100644
--- a/docling_core/transforms/visualizer/layout_visualizer.py
+++ b/docling_core/transforms/visualizer/layout_visualizer.py
@@ -10,10 +10,16 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc import DocItemLabel
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc import (
+    BoundingRectangle,
+    ContentLayer,
+    CoordOrigin,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    ProvenanceItem,
+    TextCell,
+)
 
 
 class _TLBoundingRectangle(BoundingRectangle):
@@ -173,7 +179,7 @@ def _draw_doc_layout(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in elem.prov:
+            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
                 page_nr = prov.page_no
 
                 if page_nr in my_images:
diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py
index 60874333..27583613 100644
--- a/docling_core/transforms/visualizer/reading_order_visualizer.py
+++ b/docling_core/transforms/visualizer/reading_order_visualizer.py
@@ -14,6 +14,7 @@
     DocItem,
     DoclingDocument,
     PictureItem,
+    ProvenanceItem,
 )
 
 
@@ -130,7 +131,7 @@ def _draw_doc_reading_order(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in elem.prov:
+            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
                 page_no = prov.page_no
                 image = my_images.get(page_no)
 
diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py
index 489a6d9a..d3790d6b 100644
--- a/docling_core/transforms/visualizer/table_visualizer.py
+++ b/docling_core/transforms/visualizer/table_visualizer.py
@@ -10,7 +10,12 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
+from docling_core.types.doc import (
+    ContentLayer,
+    DoclingDocument,
+    ProvenanceItem,
+    TableItem,
+)
 
 _log = logging.getLogger(__name__)
 
@@ -185,10 +190,10 @@ def _draw_doc_tables(
                 image = pil_img.copy()
                 my_images[page_nr] = image
 
-        for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
+        for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
             if not isinstance(elem, TableItem):
                 continue
-            if len(elem.prov) == 0:
+            if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
                 continue  # Skip elements without provenances
 
             if len(elem.prov) == 1:
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index 3c699f89..f0e0e92d 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -61,6 +61,7 @@
     Script,
     SectionHeaderItem,
     SummaryMetaField,
+    TableAnnotationType,
     TableCell,
     TableData,
     TableItem,
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 0ecc3e51..e071b2b9 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -32,10 +32,12 @@
     AnyUrl,
     BaseModel,
     ConfigDict,
+    Discriminator,
     Field,
     FieldSerializationInfo,
     SerializerFunctionWrapHandler,
     StringConstraints,
+    Tag,
     computed_field,
     field_serializer,
     field_validator,
@@ -65,6 +67,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.webvtt import _WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1155,11 +1158,81 @@ def from_multipage_doctags_and_images(
 
 
 class ProvenanceItem(BaseModel):
-    """ProvenanceItem."""
+    """Provenance information for elements extracted from a textual document.
 
-    page_no: int
-    bbox: BoundingBox
-    charspan: tuple[int, int]
+    A `ProvenanceItem` object acts as a lightweight pointer back into the original
+    document for an extracted element. It applies to documents with an explicit
+    or implicit layout, such as PDF, HTML, docx, or pptx.
+    """
+
+    page_no: Annotated[int, Field(description="Page number")]
+    bbox: Annotated[BoundingBox, Field(description="Bounding box")]
+    charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
+
+
+class ProvenanceTrack(BaseModel):
+    """Provenance information for elements extracted from media assets.
+
+    A `ProvenanceTrack` instance describes a cue in a text track associated with a
+    media element (audio, video, subtitles, screen recordings, ...).
+    """
+
+    start_time: Annotated[
+        _WebVTTTimestamp,
+        Field(
+            examples=["00:11.000", "00:00:06.500", "01:28:34.300"],
+            description="Start time offset of the track cue",
+        ),
+    ]
+    end_time: Annotated[
+        _WebVTTTimestamp,
+        Field(
+            examples=["00:12.000", "00:00:08.200", "01:29:30.100"],
+            description="End time offset of the track cue",
+        ),
+    ]
+    identifier: Optional[str] = Field(
+        None,
+        examples=["test", "123", "b72d946"],
+        description="An identifier of the cue",
+    )
+    voice: Optional[str] = Field(
+        None,
+        examples=["Mary", "Fred", "Name Surname"],
+        description="The cue voice (speaker)",
+    )
+    language: Optional[str] = Field(
+        None,
+        examples=["en", "en-GB", "fr-CA"],
+        description="Language of the cue in BCP 47 language tag format",
+    )
+    classes: Optional[list[str]] = Field(
+        None,
+        min_length=1,
+        examples=[["first", "loud", "yellow"]],
+        description="Classes for describing the cue significance",
+    )
+
+
+def get_provenance_discriminator_value(v: Any) -> str:
+    """Select the provenance union tag for a raw dict or a model instance.
+
+    Args:
+        v: Either dict or model input.
+
+    Returns:
+        "item" when `v` has a `bbox`, `page_no` or `charspan` key or attribute, else "track".
+    """
+    item_fields = {"bbox", "page_no", "charspan"}
+    if isinstance(v, dict):
+        return "track" if item_fields.isdisjoint(v) else "item"
+    return "item" if any(hasattr(v, name) for name in item_fields) else "track"
+
+
+ProvenanceType = Annotated[
+    Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
+    Discriminator(get_provenance_discriminator_value),
+]
 
 
 class ContentLayer(str, Enum):
@@ -1468,7 +1541,7 @@ class DocItem(NodeItem):  # Base type for any element that carries content, can
     """DocItem."""
 
     label: DocItemLabel
-    prov: list[ProvenanceItem] = []
+    prov: list[ProvenanceType] = []
     comments: list[FineRef] = []  # References to comment items annotating this content
 
     @model_serializer(mode="wrap")
@@ -1493,7 +1566,7 @@ def get_location_tokens(
             return ""
 
         location = ""
-        for prov in self.prov:
+        for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)):
             page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
 
             loc_str = DocumentToken.get_location(
@@ -1515,10 +1588,13 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if a valid image of the page containing this DocItem is not available
         in doc.
         """
-        if not len(self.prov):
+        if not self.prov or prov_index >= len(self.prov):
+            return None
+        prov = self.prov[prov_index]
+        if not isinstance(prov, ProvenanceItem):
             return None
 
-        page = doc.pages.get(self.prov[prov_index].page_no)
+        page = doc.pages.get(prov.page_no)
         if page is None or page.size is None or page.image is None:
             return None
 
@@ -1526,9 +1602,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if not page_image:
             return None
         crop_bbox = (
-            self.prov[prov_index]
-            .bbox.to_top_left_origin(page_height=page.size.height)
-            .scale_to_size(old_size=page.size, new_size=page.image.size)
+            prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size(
+                old_size=page.size, new_size=page.image.size
+            )
             # .scaled(scale=page_image.height / page.size.height)
         )
         return page_image.crop(crop_bbox.as_tuple())
@@ -2199,7 +2275,7 @@ def export_to_otsl(
             return ""
 
         page_no = 0
-        if len(self.prov) > 0:
+        if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem):
             page_no = self.prov[0].page_no
 
         for i in range(nrows):
@@ -2329,7 +2405,7 @@ class GraphCell(BaseModel):
     text: str  # sanitized text
     orig: str  # text as seen on document
 
-    prov: Optional[ProvenanceItem] = None
+    prov: Optional[ProvenanceType] = None
 
     # in case you have a text, table or picture item
     item_ref: Optional[RefItem] = None
@@ -2978,7 +3054,7 @@ def add_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -2989,7 +3065,7 @@ def add_list_item(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3030,7 +3106,7 @@ def add_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3041,7 +3117,7 @@ def add_text(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3167,7 +3243,7 @@ def add_table(
         self,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
@@ -3177,7 +3253,7 @@ def add_table(
 
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
 
@@ -3213,7 +3289,7 @@ def add_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
     ):
@@ -3222,7 +3298,7 @@ def add_picture(
         :param data: Optional[list[PictureData]]: (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3254,7 +3330,7 @@ def add_title(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3265,7 +3341,7 @@ def add_title(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3300,7 +3376,7 @@ def add_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3313,7 +3389,7 @@ def add_code(
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3351,7 +3427,7 @@ def add_formula(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3362,7 +3438,7 @@ def add_formula(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3396,7 +3472,7 @@ def add_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3408,7 +3484,7 @@ def add_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3441,13 +3517,13 @@ def add_heading(
     def add_key_values(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_key_values.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3472,13 +3548,13 @@ def add_key_values(
     def add_form(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_form.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3667,7 +3743,7 @@ def insert_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3680,7 +3756,7 @@ def insert_list_item(
         :param enumerated: bool:  (Default value = False)
         :param marker: Optional[str]:  (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3739,7 +3815,7 @@ def insert_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3751,7 +3827,7 @@ def insert_text(
         :param label: DocItemLabel:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3851,7 +3927,7 @@ def insert_table(
         sibling: NodeItem,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
         annotations: Optional[list[TableAnnotationType]] = None,
@@ -3862,7 +3938,7 @@ def insert_table(
         :param sibling: NodeItem:
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param annotations: Optional[list[TableAnnotationType]]: (Default value = None)
@@ -3899,7 +3975,7 @@ def insert_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         after: bool = True,
     ) -> PictureItem:
@@ -3909,7 +3985,7 @@ def insert_picture(
         :param annotations: Optional[list[PictureDataType]]: (Default value = None)
         :param image: Optional[ImageRef]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
@@ -3943,7 +4019,7 @@ def insert_title(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3954,7 +4030,7 @@ def insert_title(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3994,7 +4070,7 @@ def insert_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4007,7 +4083,7 @@ def insert_code(
         :param code_language: Optional[str]: (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4049,7 +4125,7 @@ def insert_formula(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4060,7 +4136,7 @@ def insert_formula(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4099,7 +4175,7 @@ def insert_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4111,7 +4187,7 @@ def insert_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4149,14 +4225,14 @@ def insert_key_values(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         after: bool = True,
     ) -> KeyValueItem:
         """Creates a new KeyValueItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: KeyValueItem: The newly created KeyValueItem item.
@@ -4178,14 +4254,14 @@ def insert_form(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         after: bool = True,
     ) -> FormItem:
         """Creates a new FormItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: FormItem: The newly created FormItem item.
@@ -4522,7 +4598,10 @@ def _iterate_items_with_stack(
             (not isinstance(root, GroupItem) or with_groups)
             and (
                 not isinstance(root, DocItem)
-                or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov))
+                or (
+                    page_nrs is None
+                    or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem))
+                )
             )
             and root.content_layer in my_layers
         )
@@ -4625,7 +4704,7 @@ def _with_pictures_refs(
         image_dir.mkdir(parents=True, exist_ok=True)
 
         if image_dir.is_dir():
-            for item, level in result.iterate_items(page_no=page_no, with_groups=False):
+            for item, _ in result.iterate_items(page_no=page_no, with_groups=False):
                 if isinstance(item, PictureItem):
                     img = item.get_image(doc=self)
                     if img is not None:
@@ -4644,10 +4723,11 @@ def _with_pictures_refs(
                             else:
                                 obj_path = loc_path
 
-                            if item.image is None:
+                            if item.image is None and isinstance(item.prov[0], ProvenanceItem):
                                 scale = img.size[0] / item.prov[0].bbox.width
                                 item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale))
-                            item.image.uri = Path(obj_path)
+                            elif item.image is not None:
+                                item.image.uri = Path(obj_path)
 
                         # if item.image._pil is not None:
                         #    item.image._pil.close()
@@ -6049,7 +6129,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
                     if isinstance(new_item, DocItem):
                         # update page numbers
                         # NOTE other prov sources (e.g. GraphCell) currently not covered
-                        for prov in new_item.prov:
+                        for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)):
                             prov.page_no += page_delta
 
                     if item.parent:
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6d60a2d8..bddd6140 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -23,9 +23,7 @@ class _WebVTTLineTerminator(str, Enum):
     CR = "\r"
 
 
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
+_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
 class _WebVTTTimestamp(BaseModel):
@@ -39,14 +37,10 @@ class _WebVTTTimestamp(BaseModel):
 
     raw: Annotated[
         str,
-        Field(
-            description="A representation of the WebVTT Timestamp as a single string"
-        ),
+        Field(description="A representation of the WebVTT Timestamp as a single string"),
     ]
 
-    _pattern: ClassVar[re.Pattern] = re.compile(
-        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
-    )
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$")
     _hours: int
     _minutes: int
     _seconds: int
@@ -72,12 +66,7 @@ def validate_raw(self) -> Self:
     @property
     def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
-        return (
-            self._hours * 3600
-            + self._minutes * 60
-            + self._seconds
-            + self._millis / 1000.0
-        )
+        return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
     @override
     def __str__(self) -> str:
@@ -87,9 +76,7 @@ def __str__(self) -> str:
 class _WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
-    start: Annotated[
-        _WebVTTTimestamp, Field(description="Start time offset of the cue")
-    ]
+    start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")]
     end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
 
     @model_validator(mode="after")
@@ -116,9 +103,7 @@ def is_valid_text(cls, value: str) -> str:
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
-                raise ValueError(
-                    f"Cue text contains an invalid HTML entity: &{entity};"
-                )
+                raise ValueError(f"Cue text contains an invalid HTML entity: &{entity};")
         if "&" in re.sub(_ENTITY_PATTERN, "", value):
             raise ValueError("Found '&' not part of a valid entity in the cue text")
         if any(ch in value for ch in {"\n", "\r", "<"}):
@@ -150,20 +135,12 @@ class _WebVTTCueInternalText(BaseModel):
     terminator: Optional[_WebVTTLineTerminator] = None
     components: Annotated[
         list[_WebVTTCueComponentWithTerminator],
-        Field(
-            description=(
-                "WebVTT caption or subtitle cue components representing the "
-                "cue internal text"
-            )
-        ),
+        Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
     ] = []
 
     @override
     def __str__(self):
-        cue_str = (
-            f"{self.terminator.value if self.terminator else ''}"
-            f"{''.join(str(span) for span in self.components)}"
-        )
+        cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
 
 
@@ -181,9 +158,7 @@ class _WebVTTCueSpanStartTag(BaseModel):
     def validate_classes(cls, value: list[str]) -> list[str]:
         for item in value:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
-                raise ValueError(
-                    "A cue span start tag class contains invalid characters"
-                )
+                raise ValueError("A cue span start tag class contains invalid characters")
             if not item:
                 raise ValueError("Cue span start tag classes cannot be empty")
         return value
@@ -207,9 +182,7 @@ def is_valid_annotation(cls, value: str) -> str:
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
-                raise ValueError(
-                    f"Annotation contains an invalid HTML entity: &{entity};"
-                )
+                raise ValueError(f"Annotation contains an invalid HTML entity: &{entity};")
         if "&" in re.sub(_ENTITY_PATTERN, "", value):
             raise ValueError("Found '&' not part of a valid entity in annotation")
         if any(ch in value for ch in {"\n", "\r", ">"}):
@@ -324,9 +297,7 @@ class _WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[_WebVTTCueIdentifier] = Field(
-        None, description="The WebVTT cue identifier"
-    )
+    identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
     timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[_WebVTTCueComponentWithTerminator],
@@ -368,9 +339,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(
-            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
-        )
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -388,9 +357,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             match = matches[i]
             if match.start() > pos:
                 stack[-1].append(
-                    _WebVTTCueComponentWithTerminator(
-                        component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])
-                    )
+                    _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
                 )
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
@@ -410,9 +377,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
                         st = (
-                            _WebVTTCueSpanStartTagAnnotated(
-                                name=ct, classes=classes, annotation=annotation.strip()
-                            )
+                            _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
                             if annotation
                             else _WebVTTCueSpanStartTag(name=ct, classes=classes)
                         )
@@ -430,19 +395,13 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                             cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
                         elif ct == "v":
                             cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
-                        stack[-1].append(
-                            _WebVTTCueComponentWithTerminator(component=cp)
-                        )
+                        stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp))
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text=cue_text[pos:])
-                )
-            )
+            stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:])))
 
         return cls(
             identifier=identifier,
diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
index 04761799..5ebac4be 100644
--- a/docling_core/utils/legacy.py
+++ b/docling_core/utils/legacy.py
@@ -7,20 +7,23 @@
 
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    GroupItem,
+    ListItem,
     PictureItem,
     ProvenanceItem,
     SectionHeaderItem,
     Size,
     TableCell,
+    TableData,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
 from docling_core.types.legacy_doc.base import (
     BaseCell,
     BaseText,
@@ -162,6 +165,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
                         span=[0, len(item.text)],
                     )
                     for p in item.prov
+                    if isinstance(p, ProvenanceItem)
                 ]
                 main_text.append(
                     BaseText(
@@ -283,6 +287,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, 0],
                             )
                             for p in item.prov
+                            if isinstance(p, ProvenanceItem)
                         ],
                     )
                 )
@@ -310,6 +315,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, len(caption)],
                             )
                             for p in item.prov
+                            if isinstance(p, ProvenanceItem)
                         ],
                         obj_type=doc_item_label_to_legacy_type(item.label),
                         text=caption,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 03b7d8cd..eca74ef4 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -233,7 +233,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -651,7 +658,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -793,7 +807,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -873,13 +894,21 @@
         "prov": {
           "anyOf": [
             {
-              "$ref": "#/$defs/ProvenanceItem"
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/ProvenanceItem"
+                },
+                {
+                  "$ref": "#/$defs/ProvenanceTrack"
+                }
+              ]
             },
             {
               "type": "null"
             }
           ],
-          "default": null
+          "default": null,
+          "title": "Prov"
         },
         "item_ref": {
           "anyOf": [
@@ -1198,7 +1227,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -1370,7 +1406,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -1746,7 +1789,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2139,16 +2189,19 @@
       "type": "object"
     },
     "ProvenanceItem": {
-      "description": "ProvenanceItem.",
+      "description": "Provenance information for elements extracted from a textual document.\n\nA `ProvenanceItem` object acts as a lightweight pointer back into the original\ndocument for an extracted element. It applies to documents with an explicity\nor implicit layout, such as PDF, HTML, docx, or pptx.",
       "properties": {
         "page_no": {
+          "description": "Page number",
           "title": "Page No",
           "type": "integer"
         },
         "bbox": {
-          "$ref": "#/$defs/BoundingBox"
+          "$ref": "#/$defs/BoundingBox",
+          "description": "Bounding box"
         },
         "charspan": {
+          "description": "Character span (0-indexed)",
           "maxItems": 2,
           "minItems": 2,
           "prefixItems": [
@@ -2171,6 +2224,111 @@
       "title": "ProvenanceItem",
       "type": "object"
     },
+    "ProvenanceTrack": {
+      "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
+      "properties": {
+        "start_time": {
+          "$ref": "#/$defs/_WebVTTTimestamp",
+          "description": "Start time offset of the track cue",
+          "examples": [
+            "00.11.000",
+            "00:00:06.500",
+            "01:28:34.300"
+          ]
+        },
+        "end_time": {
+          "$ref": "#/$defs/_WebVTTTimestamp",
+          "description": "End time offset of the track cue",
+          "examples": [
+            "00.12.000",
+            "00:00:08.200",
+            "01:29:30.100"
+          ]
+        },
+        "identifier": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "An identifier of the cue",
+          "examples": [
+            "test",
+            "123",
+            "b72d946"
+          ],
+          "title": "Identifier"
+        },
+        "voice": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The cue voice (speaker)",
+          "examples": [
+            "Mary",
+            "Fred",
+            "Name Surname"
+          ],
+          "title": "Voice"
+        },
+        "language": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Language of the cue in BCP 47 language tag format",
+          "examples": [
+            "en",
+            "en-GB",
+            "fr-CA"
+          ],
+          "title": "Language"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "minItems": 1,
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Classes for describing the cue significance",
+          "examples": [
+            "first",
+            "loud",
+            "yellow"
+          ],
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "ProvenanceTrack",
+      "type": "object"
+    },
     "RefItem": {
       "description": "RefItem.",
       "properties": {
@@ -2327,7 +2485,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2622,7 +2787,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2827,7 +2999,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2939,7 +3118,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2997,6 +3183,21 @@
       ],
       "title": "TitleItem",
       "type": "object"
+    },
+    "_WebVTTTimestamp": {
+      "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
+      "properties": {
+        "raw": {
+          "description": "A representation of the WebVTT Timestamp as a single string",
+          "title": "Raw",
+          "type": "string"
+        }
+      },
+      "required": [
+        "raw"
+      ],
+      "title": "_WebVTTTimestamp",
+      "type": "object"
     }
   },
   "description": "DoclingDocument.",

From b9bb0535ff4fe0fe6e9d9488fc22fc8d1a5151f9 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 4 Dec 2025 14:49:53 +0100
Subject: [PATCH 05/20] refactor(webvtt): make WebVTTTimestamp public

Since WebVTTTimestamp is used in DoclingDocument, the class should be public.
Strengthen validation of cue language start tag annotation.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/__init__.py |  1 +
 docling_core/types/doc/document.py |  6 ++--
 docling_core/types/doc/webvtt.py   | 50 ++++++++++++++++++++++++------
 docs/DoclingDocument.json          | 10 +++---
 test/test_webvtt.py                | 31 ++++++++++++------
 5 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index f0e0e92d..d8ddd0b4 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -56,6 +56,7 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
+    ProvenanceTrack,
     RefItem,
     RichTableCell,
     Script,
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index e071b2b9..acb8b7a5 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -67,7 +67,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
-from docling_core.types.doc.webvtt import _WebVTTTimestamp
+from docling_core.types.doc.webvtt import WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1178,14 +1178,14 @@ class ProvenanceTrack(BaseModel):
     """
 
     start_time: Annotated[
-        _WebVTTTimestamp,
+        WebVTTTimestamp,
         Field(
             examples=["00.11.000", "00:00:06.500", "01:28:34.300"],
             description="Start time offset of the track cue",
         ),
     ]
     end_time: Annotated[
-        _WebVTTTimestamp,
+        WebVTTTimestamp,
         Field(
             examples=["00.12.000", "00:00:08.200", "01:29:30.100"],
             description="End time offset of the track cue",
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index bddd6140..f6a6ea73 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -26,9 +26,18 @@ class _WebVTTLineTerminator(str, Enum):
 _WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
-class _WebVTTTimestamp(BaseModel):
+class WebVTTTimestamp(BaseModel):
     """WebVTT timestamp.
 
+    The timestamp is a string consisting of the following components in the given order:
+
+    - hours (optional, required if non-zero): two or more digits
+    - minutes: two digits between 0 and 59
+    - a colon character (:)
+    - seconds: two digits between 0 and 59
+    - a full stop character (.)
+    - thousandths of a second: three digits
+
     A WebVTT timestamp is always interpreted relative to the current playback position
     of the media data that the WebVTT file is to be synchronized with.
     """
@@ -48,6 +57,7 @@ class _WebVTTTimestamp(BaseModel):
 
     @model_validator(mode="after")
     def validate_raw(self) -> Self:
+        """Validate the WebVTT timestamp as a string."""
         m = self._pattern.match(self.raw)
         if not m:
             raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
@@ -70,14 +80,15 @@ def seconds(self) -> float:
 
     @override
     def __str__(self) -> str:
+        """Return a string representation of a WebVTT timestamp."""
         return self.raw
 
 
 class _WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
-    start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")]
-    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+    start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")]
+    end: Annotated[WebVTTTimestamp, Field(description="End time offset of the cue")]
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
@@ -197,6 +208,21 @@ def __str__(self):
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
+class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
+    """Start tag of a cue language span whose annotation must look like a BCP 47 language tag."""
+
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE)
+    name: Literal["lang"] = Field("lang", description="The tag name")
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    @override
+    def is_valid_annotation(cls, value: str) -> str:
+        if not cls._pattern.match(value):
+            raise ValueError("Annotation should be in BCP 47 language tag format")
+        return value
+
+
 class _WebVTTCueComponentBase(BaseModel):
     """WebVTT caption or subtitle cue component.
 
@@ -267,7 +293,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["lang"] = "lang"
-    start_tag: _WebVTTCueSpanStartTagAnnotated
+    start_tag: _WebVTTCueLanguageSpanStartTag
 
 
 _WebVTTCueComponent = Annotated[
@@ -339,7 +365,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end))
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -376,11 +402,15 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         classes: list[str] = []
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
-                        st = (
-                            _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
-                            if annotation
-                            else _WebVTTCueSpanStartTag(name=ct, classes=classes)
-                        )
+                        st: _WebVTTCueSpanStartTag
+                        if annotation and ct == "lang":
+                            st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
+                        elif annotation:
+                            st = _WebVTTCueSpanStartTagAnnotated(
+                                name=ct, classes=classes, annotation=annotation.strip()
+                            )
+                        else:
+                            st = _WebVTTCueSpanStartTag(name=ct, classes=classes)
                         it = _WebVTTCueInternalText(components=children)
                         cp: _WebVTTCueComponent
                         if ct == "c":
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index eca74ef4..adc3aac5 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2228,7 +2228,7 @@
       "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
       "properties": {
         "start_time": {
-          "$ref": "#/$defs/_WebVTTTimestamp",
+          "$ref": "#/$defs/WebVTTTimestamp",
           "description": "Start time offset of the track cue",
           "examples": [
             "00.11.000",
@@ -2237,7 +2237,7 @@
           ]
         },
         "end_time": {
-          "$ref": "#/$defs/_WebVTTTimestamp",
+          "$ref": "#/$defs/WebVTTTimestamp",
           "description": "End time offset of the track cue",
           "examples": [
             "00.12.000",
@@ -3184,8 +3184,8 @@
       "title": "TitleItem",
       "type": "object"
     },
-    "_WebVTTTimestamp": {
-      "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
+    "WebVTTTimestamp": {
+      "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
       "properties": {
         "raw": {
           "description": "A representation of the WebVTT Timestamp as a single string",
@@ -3196,7 +3196,7 @@
       "required": [
         "raw"
       ],
-      "title": "_WebVTTTimestamp",
+      "title": "WebVTTTimestamp",
       "type": "object"
     }
   },
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index b4d408cb..f4013831 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -9,17 +9,18 @@
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    WebVTTTimestamp,
     _WebVTTCueBlock,
     _WebVTTCueComponentWithTerminator,
     _WebVTTCueInternalText,
     _WebVTTCueItalicSpan,
     _WebVTTCueLanguageSpan,
+    _WebVTTCueLanguageSpanStartTag,
     _WebVTTCueSpanStartTagAnnotated,
     _WebVTTCueTextSpan,
     _WebVTTCueTimings,
     _WebVTTCueVoiceSpan,
     _WebVTTFile,
-    _WebVTTTimestamp,
 )
 
 from .test_data_gen_flag import GEN_TEST_DATA
@@ -42,7 +43,7 @@ def test_vtt_cue_commponents() -> None:
         0.0,
     ]
     for idx, ts in enumerate(valid_timestamps):
-        model = _WebVTTTimestamp(raw=ts)
+        model = WebVTTTimestamp(raw=ts)
         assert model.seconds == valid_total_seconds[idx]
 
     """Test invalid WebVTT timestamps."""
@@ -57,35 +58,35 @@ def test_vtt_cue_commponents() -> None:
     ]
     for ts in invalid_timestamps:
         with pytest.raises(ValidationError):
-            _WebVTTTimestamp(raw=ts)
+            WebVTTTimestamp(raw=ts)
 
     """Test the timestamp __str__ method."""
-    model = _WebVTTTimestamp(raw="00:01:02.345")
+    model = WebVTTTimestamp(raw="00:01:02.345")
     assert str(model) == "00:01:02.345"
 
     """Test valid cue timings."""
-    start = _WebVTTTimestamp(raw="00:10.005")
-    end = _WebVTTTimestamp(raw="00:14.007")
+    start = WebVTTTimestamp(raw="00:10.005")
+    end = WebVTTTimestamp(raw="00:14.007")
     cue_timings = _WebVTTCueTimings(start=start, end=end)
     assert cue_timings.start == start
     assert cue_timings.end == end
     assert str(cue_timings) == "00:10.005 --> 00:14.007"
 
     """Test invalid cue timings with end timestamp before start."""
-    start = _WebVTTTimestamp(raw="00:10.700")
-    end = _WebVTTTimestamp(raw="00:10.500")
+    start = WebVTTTimestamp(raw="00:10.700")
+    end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(start=start, end=end)
     assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
 
     """Test invalid cue timings with missing end."""
-    start = _WebVTTTimestamp(raw="00:10.500")
+    start = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
-    end = _WebVTTTimestamp(raw="00:10.500")
+    end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
@@ -272,3 +273,13 @@ def test_webvtt_file() -> None:
     assert len(block.payload) == 1
     assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
+
+
+def test_webvtt_cue_language_span_start_tag():
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
+    with pytest.raises(ValidationError, match="BCP 47"):
+        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
+    with pytest.raises(ValidationError, match="BCP 47"):
+        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')

From b26c08662c5488b33e6ebb3043c05ae754ad4a43 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 10:49:23 +0100
Subject: [PATCH 06/20] refactor(webvtt): set languages to a list of strings in
 ProvenanceTrack

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py |  6 +++---
 docs/DoclingDocument.json          | 21 ++++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index acb8b7a5..6f02c54f 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1201,10 +1201,10 @@ class ProvenanceTrack(BaseModel):
         examples=["Mary", "Fred", "Name Surname"],
         description="The cue voice (speaker)",
     )
-    language: Optional[str] = Field(
+    languages: Optional[list[str]] = Field(
         None,
-        examples=["en", "en-GB", "fr-CA"],
-        description="Language of the cue in BCP 47 language tag format",
+        examples=[["en", "en-GB"], ["fr-CA"]],
+        description="Languages of the cue in BCP 47 language tag format",
     )
     classes: Optional[list[str]] = Field(
         None,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index adc3aac5..35175601 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2281,23 +2281,30 @@
           ],
           "title": "Voice"
         },
-        "language": {
+        "languages": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "Language of the cue in BCP 47 language tag format",
+          "description": "Languages of the cue in BCP 47 language tag format",
           "examples": [
-            "en",
-            "en-GB",
-            "fr-CA"
+            [
+              "en",
+              "en-GB"
+            ],
+            [
+              "fr-CA"
+            ]
           ],
-          "title": "Language"
+          "title": "Languages"
         },
         "classes": {
           "anyOf": [

From d0c97fcfa128d2b9277692cf45ede7f4cb4781af Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 11:04:59 +0100
Subject: [PATCH 07/20] tests(webvtt): add test for ProvenanceTrack

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 test/test_doc_base.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 709e2eac..18d2cf11 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,6 +1,8 @@
 import pytest
 from pydantic import ValidationError
 
+from docling_core.types.doc.document import ProvenanceTrack
+from docling_core.types.doc.webvtt import WebVTTTimestamp
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -37,3 +39,34 @@ def test_prov():
     with pytest.raises(ValidationError, match="at least 2 items"):
         prov["span"] = [0]
         Prov(**prov)
+
+
+def test_prov_track():
+    """Test the class ProvenanceTrack."""
+
+    valid_track = ProvenanceTrack(
+        start_time=WebVTTTimestamp(raw="00:11.000"),
+        end_time=WebVTTTimestamp(raw="00:12.000"),
+        identifier="test",
+        voice="Mary",
+        languages=["en", "en-GB"],
+        classes=["v.first.loud", "i.foreignphrase"],
+    )
+
+    assert valid_track
+    assert valid_track.start_time == WebVTTTimestamp(raw="00:11.000")
+    assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000")
+    assert valid_track.identifier == "test"
+    assert valid_track.voice == "Mary"
+    assert valid_track.languages == ["en", "en-GB"]
+    assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
+
+    with pytest.raises(ValidationError, match="end_time"):
+        ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000"))
+
+    with pytest.raises(ValidationError, match="should be a valid list"):
+        ProvenanceTrack(
+            start_time=WebVTTTimestamp(raw="00:11.000"),
+            end_time=WebVTTTimestamp(raw="00:12.000"),
+            languages="en",
+        )

From 86d7fe49a8ee400c21be6527d9be02a04d0f6484 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 11:29:51 +0100
Subject: [PATCH 08/20] refactor(webvtt): make all WebVTT classes public for
 reuse

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 156 +++++++++++++++++++------------
 test/test_webvtt.py              | 130 +++++++++++++-------------
 2 files changed, 161 insertions(+), 125 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index f6a6ea73..550498a9 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -84,7 +84,7 @@ def __str__(self) -> str:
         return self.raw
 
 
-class _WebVTTCueTimings(BaseModel):
+class WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
     start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")]
@@ -92,6 +92,7 @@ class _WebVTTCueTimings(BaseModel):
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
+        """Ensure the start timestamp is strictly less than the end timestamp."""
         if self.start and self.end:
             if self.end.seconds <= self.start.seconds:
                 raise ValueError("End timestamp must be greater than start timestamp")
@@ -99,10 +100,11 @@ def check_order(self) -> Self:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue timings."""
         return f"{self.start} --> {self.end}"
 
 
-class _WebVTTCueTextSpan(BaseModel):
+class WebVTTCueTextSpan(BaseModel):
     """WebVTT cue text span."""
 
     kind: Literal["text"] = "text"
@@ -111,6 +113,7 @@ class _WebVTTCueTextSpan(BaseModel):
     @field_validator("text", mode="after")
     @classmethod
     def is_valid_text(cls, value: str) -> str:
+        """Ensure cue text contains only permitted characters and HTML entities."""
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
@@ -126,36 +129,39 @@ def is_valid_text(cls, value: str) -> str:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue text span."""
         return self.text
 
 
-class _WebVTTCueComponentWithTerminator(BaseModel):
+class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
-    component: "_WebVTTCueComponent"
+    component: "WebVTTCueComponent"
     terminator: Optional[_WebVTTLineTerminator] = None
 
     @override
     def __str__(self):
+        """Return a string representation of the cue component with terminator."""
         return f"{self.component}{self.terminator.value if self.terminator else ''}"
 
 
-class _WebVTTCueInternalText(BaseModel):
+class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
     terminator: Optional[_WebVTTLineTerminator] = None
     components: Annotated[
-        list[_WebVTTCueComponentWithTerminator],
+        list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
     ] = []
 
     @override
     def __str__(self):
+        """Return a string representation of the cue internal text."""
         cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
 
 
-class _WebVTTCueSpanStartTag(BaseModel):
+class WebVTTCueSpanStartTag(BaseModel):
     """WebVTT cue span start tag."""
 
     name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
@@ -167,6 +173,7 @@ class _WebVTTCueSpanStartTag(BaseModel):
     @field_validator("classes", mode="after")
     @classmethod
     def validate_classes(cls, value: list[str]) -> list[str]:
+        """Validate cue span start tag classes."""
         for item in value:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                 raise ValueError("A cue span start tag class contains invalid characters")
@@ -175,14 +182,16 @@ def validate_classes(cls, value: list[str]) -> list[str]:
         return value
 
     def _get_name_with_classes(self) -> str:
+        """Return the name of the cue span start tag with classes."""
         return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
 
     @override
     def __str__(self):
+        """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()}>"
 
 
-class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
+class WebVTTCueSpanStartTagAnnotated(WebVTTCueSpanStartTag):
     """WebVTT cue span start tag requiring an annotation."""
 
     annotation: Annotated[str, Field(description="Cue span start tag annotation")]
@@ -190,6 +199,7 @@ class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
     @field_validator("annotation", mode="after")
     @classmethod
     def is_valid_annotation(cls, value: str) -> str:
+        """Ensure annotation contains only permitted characters and HTML entities."""
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
@@ -205,10 +215,13 @@ def is_valid_annotation(cls, value: str) -> str:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
-class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
+class WebVTTCueLanguageSpanStartTag(WebVTTCueSpanStartTagAnnotated):
+    """WebVTT cue language span start tag."""
+
     _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE)
 
     name: Literal["lang"] = Field("lang", description="The tag name")
@@ -217,13 +230,14 @@ class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
     @classmethod
     @override
     def is_valid_annotation(cls, value: str) -> str:
+        """Ensure that the language annotation is in BCP 47 language tag format."""
         if cls._pattern.match(value):
             return value
         else:
             raise ValueError("Annotation should be in BCP 47 language tag format")
 
 
-class _WebVTTCueComponentBase(BaseModel):
+class WebVTTCueComponentBase(BaseModel):
     """WebVTT caption or subtitle cue component.
 
     All the WebVTT caption or subtitle cue components are represented by this class
@@ -231,28 +245,30 @@ class _WebVTTCueComponentBase(BaseModel):
     """
 
     kind: Literal["c", "b", "i", "u", "v", "lang"]
-    start_tag: _WebVTTCueSpanStartTag
-    internal_text: _WebVTTCueInternalText
+    start_tag: WebVTTCueSpanStartTag
+    internal_text: WebVTTCueInternalText
 
     @model_validator(mode="after")
     def check_tag_names_match(self) -> Self:
+        """Ensure that the start tag name matches this cue component type."""
         if self.kind != self.start_tag.name:
             raise ValueError("The tag name of this cue component should be {self.kind}")
         return self
 
     @override
     def __str__(self):
+        """Return a string representation of the cue component."""
         return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
 
 
-class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase):
+class WebVTTCueVoiceSpan(WebVTTCueComponentBase):
     """WebVTT cue voice span associated with a specific voice."""
 
     kind: Literal["v"] = "v"
-    start_tag: _WebVTTCueSpanStartTagAnnotated
+    start_tag: WebVTTCueSpanStartTagAnnotated
 
 
-class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
+class WebVTTCueClassSpan(WebVTTCueComponentBase):
     """WebVTT cue class span.
 
     It represents a span of text and it is used to annotate parts of the cue with
@@ -260,31 +276,31 @@ class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["c"] = "c"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="c")
 
 
-class _WebVTTCueItalicSpan(_WebVTTCueComponentBase):
+class WebVTTCueItalicSpan(WebVTTCueComponentBase):
     """WebVTT cue italic span representing a span of italic text."""
 
     kind: Literal["i"] = "i"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="i")
 
 
-class _WebVTTCueBoldSpan(_WebVTTCueComponentBase):
+class WebVTTCueBoldSpan(WebVTTCueComponentBase):
     """WebVTT cue bold span representing a span of bold text."""
 
     kind: Literal["b"] = "b"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="b")
 
 
-class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase):
+class WebVTTCueUnderlineSpan(WebVTTCueComponentBase):
     """WebVTT cue underline span representing a span of underline text."""
 
     kind: Literal["u"] = "u"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="u")
 
 
-class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
+class WebVTTCueLanguageSpan(WebVTTCueComponentBase):
     """WebVTT cue language span.
 
     It represents a span of text and it is used to annotate parts of the cue where the
@@ -293,18 +309,18 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["lang"] = "lang"
-    start_tag: _WebVTTCueLanguageSpanStartTag
+    start_tag: WebVTTCueLanguageSpanStartTag
 
 
-_WebVTTCueComponent = Annotated[
+WebVTTCueComponent = Annotated[
     Union[
-        _WebVTTCueTextSpan,
-        _WebVTTCueClassSpan,
-        _WebVTTCueItalicSpan,
-        _WebVTTCueBoldSpan,
-        _WebVTTCueUnderlineSpan,
-        _WebVTTCueVoiceSpan,
-        _WebVTTCueLanguageSpan,
+        WebVTTCueTextSpan,
+        WebVTTCueClassSpan,
+        WebVTTCueItalicSpan,
+        WebVTTCueBoldSpan,
+        WebVTTCueUnderlineSpan,
+        WebVTTCueVoiceSpan,
+        WebVTTCueLanguageSpan,
     ],
     Field(
         discriminator="kind",
@@ -313,7 +329,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
 ]
 
 
-class _WebVTTCueBlock(BaseModel):
+class WebVTTCueBlock(BaseModel):
     """Model representing a WebVTT cue block.
 
     The optional WebVTT cue settings list is not supported.
@@ -324,9 +340,9 @@ class _WebVTTCueBlock(BaseModel):
     model_config = ConfigDict(regex_engine="python-re")
 
     identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
-    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
-        list[_WebVTTCueComponentWithTerminator],
+        list[WebVTTCueComponentWithTerminator],
         Field(description="The WebVTT caption or subtitle cue text"),
     ]
 
@@ -341,13 +357,22 @@ class _WebVTTCueBlock(BaseModel):
     @field_validator("payload", mode="after")
     @classmethod
     def validate_payload(cls, payload):
+        """Ensure that the cue payload contains valid text."""
         for voice in payload:
             if "-->" in str(voice):
                 raise ValueError("Cue payload must not contain '-->'")
         return payload
 
     @classmethod
-    def parse(cls, raw: str) -> "_WebVTTCueBlock":
+    def parse(cls, raw: str) -> "WebVTTCueBlock":
+        """Parse a WebVTT cue block from a string.
+
+        Args:
+            raw: The raw WebVTT cue block string.
+
+        Returns:
+            The parsed WebVTT cue block.
+        """
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
@@ -365,7 +390,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
+        timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -373,7 +398,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                 cue_text += f"</{omm}>"
                 break
 
-        stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]]
+        stack: list[list[WebVTTCueComponentWithTerminator]] = [[]]
         tag_stack: list[dict] = []
 
         pos = 0
@@ -383,7 +408,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             match = matches[i]
             if match.start() > pos:
                 stack[-1].append(
-                    _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+                    WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
                 )
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
@@ -402,36 +427,34 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         classes: list[str] = []
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
-                        st: _WebVTTCueSpanStartTag
+                        st: WebVTTCueSpanStartTag
                         if annotation and ct == "lang":
-                            st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
+                            st = WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
                         elif annotation:
-                            st = _WebVTTCueSpanStartTagAnnotated(
-                                name=ct, classes=classes, annotation=annotation.strip()
-                            )
+                            st = WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
                         else:
-                            st = _WebVTTCueSpanStartTag(name=ct, classes=classes)
-                        it = _WebVTTCueInternalText(components=children)
-                        cp: _WebVTTCueComponent
+                            st = WebVTTCueSpanStartTag(name=ct, classes=classes)
+                        it = WebVTTCueInternalText(components=children)
+                        cp: WebVTTCueComponent
                         if ct == "c":
-                            cp = _WebVTTCueClassSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueClassSpan(start_tag=st, internal_text=it)
                         elif ct == "b":
-                            cp = _WebVTTCueBoldSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueBoldSpan(start_tag=st, internal_text=it)
                         elif ct == "i":
-                            cp = _WebVTTCueItalicSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueItalicSpan(start_tag=st, internal_text=it)
                         elif ct == "u":
-                            cp = _WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
                         elif ct == "lang":
-                            cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
                         elif ct == "v":
-                            cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
-                        stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp))
+                            cp = WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
+                        stack[-1].append(WebVTTCueComponentWithTerminator(component=cp))
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:])))
+            stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:])))
 
         return cls(
             identifier=identifier,
@@ -440,6 +463,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
         )
 
     def __str__(self):
+        """Return a string representation of the WebVTT cue block."""
         parts = []
         if self.identifier:
             parts.append(f"{self.identifier}\n")
@@ -455,13 +479,14 @@ def __str__(self):
         return "".join(parts) + "\n"
 
 
-class _WebVTTFile(BaseModel):
+class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
-    cue_blocks: list[_WebVTTCueBlock]
+    cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
     def verify_signature(content: str) -> bool:
+        """Verify the WebVTT file signature."""
         if not content:
             return False
         elif len(content) == 6:
@@ -472,7 +497,15 @@ def verify_signature(content: str) -> bool:
             return False
 
     @classmethod
-    def parse(cls, raw: str) -> "_WebVTTFile":
+    def parse(cls, raw: str) -> "WebVTTFile":
+        """Parse a WebVTT file.
+
+        Args:
+            raw: The raw WebVTT file content.
+
+        Returns:
+            The parsed WebVTT file.
+        """
         # Normalize newlines to LF
         raw = raw.replace("\r\n", "\n").replace("\r", "\n")
 
@@ -490,20 +523,23 @@ def parse(cls, raw: str) -> "_WebVTTFile":
 
         # Split into cue blocks
         raw_blocks = re.split(r"\n\s*\n", body.strip())
-        cues: list[_WebVTTCueBlock] = []
+        cues: list[WebVTTCueBlock] = []
         for block in raw_blocks:
             try:
-                cues.append(_WebVTTCueBlock.parse(block))
+                cues.append(WebVTTCueBlock.parse(block))
             except ValueError as e:
                 _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
 
         return cls(cue_blocks=cues)
 
     def __iter__(self):
+        """Return an iterator over the cue blocks."""
         return iter(self.cue_blocks)
 
     def __getitem__(self, idx):
+        """Return the cue block at the given index."""
         return self.cue_blocks[idx]
 
     def __len__(self):
+        """Return the number of cue blocks."""
         return len(self.cue_blocks)
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index f4013831..9e47f1a8 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -9,18 +9,18 @@
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    WebVTTCueBlock,
+    WebVTTCueComponentWithTerminator,
+    WebVTTCueInternalText,
+    WebVTTCueItalicSpan,
+    WebVTTCueLanguageSpan,
+    WebVTTCueLanguageSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTextSpan,
+    WebVTTCueTimings,
+    WebVTTCueVoiceSpan,
+    WebVTTFile,
     WebVTTTimestamp,
-    _WebVTTCueBlock,
-    _WebVTTCueComponentWithTerminator,
-    _WebVTTCueInternalText,
-    _WebVTTCueItalicSpan,
-    _WebVTTCueLanguageSpan,
-    _WebVTTCueLanguageSpanStartTag,
-    _WebVTTCueSpanStartTagAnnotated,
-    _WebVTTCueTextSpan,
-    _WebVTTCueTimings,
-    _WebVTTCueVoiceSpan,
-    _WebVTTFile,
 )
 
 from .test_data_gen_flag import GEN_TEST_DATA
@@ -67,7 +67,7 @@ def test_vtt_cue_commponents() -> None:
     """Test valid cue timings."""
     start = WebVTTTimestamp(raw="00:10.005")
     end = WebVTTTimestamp(raw="00:14.007")
-    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    cue_timings = WebVTTCueTimings(start=start, end=end)
     assert cue_timings.start == start
     assert cue_timings.end == end
     assert str(cue_timings) == "00:10.005 --> 00:14.007"
@@ -76,92 +76,92 @@ def test_vtt_cue_commponents() -> None:
     start = WebVTTTimestamp(raw="00:10.700")
     end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start, end=end)
+        WebVTTCueTimings(start=start, end=end)
     assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
 
     """Test invalid cue timings with missing end."""
     start = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
+        WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
     end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
+        WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test with valid text."""
     valid_text = "This is a valid cue text span."
-    span = _WebVTTCueTextSpan(text=valid_text)
+    span = WebVTTCueTextSpan(text=valid_text)
     assert span.text == valid_text
     assert str(span) == valid_text
 
     """Test with text containing newline characters."""
     invalid_text = "This cue text span\ncontains a newline."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
 
     """Test with text containing ampersand."""
     invalid_text = "This cue text span contains &."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
     invalid_text = "An invalid &foo; entity"
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
     valid_text = "My favorite book is Pride &amp; Prejudice"
-    span = _WebVTTCueTextSpan(text=valid_text)
+    span = WebVTTCueTextSpan(text=valid_text)
     assert span.text == valid_text
 
     """Test with text containing less-than sign."""
     invalid_text = "This cue text span contains <."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
 
     """Test with empty text."""
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text="")
+        WebVTTCueTextSpan(text="")
 
     """Test that annotation validation works correctly."""
     valid_annotation = "valid-annotation"
     invalid_annotation = "invalid\nannotation"
     with pytest.raises(ValidationError):
-        _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
-    assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
+        WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
+    assert WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
 
     """Test that classes validation works correctly."""
     annotation = "speaker name"
     valid_classes = ["class1", "class2"]
     invalid_classes = ["class\nwith\nnewlines", ""]
     with pytest.raises(ValidationError):
-        _WebVTTCueSpanStartTagAnnotated(
+        WebVTTCueSpanStartTagAnnotated(
             name="v", annotation=annotation, classes=invalid_classes
         )
-    assert _WebVTTCueSpanStartTagAnnotated(
+    assert WebVTTCueSpanStartTagAnnotated(
         name="v", annotation=annotation, classes=valid_classes
     )
 
     """Test that components validation works correctly."""
     annotation = "speaker name"
     valid_components = [
-        _WebVTTCueComponentWithTerminator(
-            component=_WebVTTCueTextSpan(text="random text")
+        WebVTTCueComponentWithTerminator(
+            component=WebVTTCueTextSpan(text="random text")
         )
     ]
     invalid_components = [123, "not a component"]
     with pytest.raises(ValidationError):
-        _WebVTTCueInternalText(components=invalid_components)
-    assert _WebVTTCueInternalText(components=valid_components)
+        WebVTTCueInternalText(components=invalid_components)
+    assert WebVTTCueInternalText(components=valid_components)
 
     """Test valid cue voice spans."""
-    cue_span = _WebVTTCueVoiceSpan(
-        start_tag=_WebVTTCueSpanStartTagAnnotated(
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(
             name="v", annotation="speaker", classes=["loud", "clear"]
         ),
-        internal_text=_WebVTTCueInternalText(
+        internal_text=WebVTTCueInternalText(
             components=[
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text="random text")
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
                 )
             ]
         ),
@@ -169,12 +169,12 @@ def test_vtt_cue_commponents() -> None:
     expected_str = "<v.loud.clear speaker>random text</v>"
     assert str(cue_span) == expected_str
 
-    cue_span = _WebVTTCueVoiceSpan(
-        start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
-        internal_text=_WebVTTCueInternalText(
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
+        internal_text=WebVTTCueInternalText(
             components=[
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text="random text")
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
                 )
             ]
         ),
@@ -188,11 +188,11 @@ def test_webvttcueblock_parse() -> None:
     raw: str = (
         "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
     )
-    block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw)
+    block: WebVTTCueBlock = WebVTTCueBlock.parse(raw)
     assert str(block.timings) == "04:02.500 --> 04:05.000"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert (
         block.payload[0].component.text
         == "J’ai commencé le basket à l'âge de 13, 14 ans"
@@ -203,23 +203,23 @@ def test_webvttcueblock_parse() -> None:
         "04:05.001 --> 04:07.800\n"
         "Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier\n"
     )
-    block = _WebVTTCueBlock.parse(raw)
+    block = WebVTTCueBlock.parse(raw)
     assert str(block.timings) == "04:05.001 --> 04:07.800"
     assert len(block.payload) == 3
-    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Sur les "
-    assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan)
+    assert isinstance(block.payload[1], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[1].component, WebVTTCueItalicSpan)
     assert len(block.payload[1].component.internal_text.components) == 1
     lang_span = block.payload[1].component.internal_text.components[0].component
-    assert isinstance(lang_span, _WebVTTCueLanguageSpan)
+    assert isinstance(lang_span, WebVTTCueLanguageSpan)
     assert isinstance(
-        lang_span.internal_text.components[0].component, _WebVTTCueTextSpan
+        lang_span.internal_text.components[0].component, WebVTTCueTextSpan
     )
     assert lang_span.internal_text.components[0].component.text == "playground"
-    assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[2].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[2], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[2].component, WebVTTCueTextSpan)
     assert block.payload[2].component.text == ", ici à Montpellier"
     assert raw == str(block)
 
@@ -228,26 +228,26 @@ def test_webvtt_file() -> None:
     """Test WebVTT files."""
     with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 13
     block = vtt.cue_blocks[11]
     assert str(block.timings) == "00:32.500 --> 00:33.500"
     assert len(block.payload) == 1
     cue_span = block.payload[0]
-    assert isinstance(cue_span.component, _WebVTTCueVoiceSpan)
+    assert isinstance(cue_span.component, WebVTTCueVoiceSpan)
     assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
     assert not cue_span.component.start_tag.classes
     assert len(cue_span.component.internal_text.components) == 1
     comp = cue_span.component.internal_text.components[0]
-    assert isinstance(comp.component, _WebVTTCueItalicSpan)
+    assert isinstance(comp.component, WebVTTCueItalicSpan)
     assert len(comp.component.internal_text.components) == 1
     comp2 = comp.component.internal_text.components[0]
-    assert isinstance(comp2.component, _WebVTTCueTextSpan)
+    assert isinstance(comp2.component, WebVTTCueTextSpan)
     assert comp2.component.text == "Laughs"
 
     with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 4
     reverse = (
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
@@ -258,7 +258,7 @@ def test_webvtt_file() -> None:
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 13
     for block in vtt:
         assert block.identifier
@@ -266,20 +266,20 @@ def test_webvtt_file() -> None:
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
     assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan)
+    assert isinstance(block.payload[0].component, WebVTTCueVoiceSpan)
     block = vtt.cue_blocks[2]
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
     assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
 
 
 def test_webvtt_cue_language_span_start_tag():
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
     with pytest.raises(ValidationError, match="BCP 47"):
-        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
     with pytest.raises(ValidationError, match="BCP 47"):
-        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')

From 82e80c0c2a6643dd5a17457a65f8736d86207cca Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 17:05:16 +0100
Subject: [PATCH 09/20] chore(webvtt): preserve newlines as
 WebVTTLineTerminator

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 38 ++++++++++++++++++--------
 test/data/webvtt/webvtt_example_04.vtt | 13 +++++++++
 test/test_webvtt.py                    | 13 +++++++++
 3 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 test/data/webvtt/webvtt_example_04.vtt

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 550498a9..023b0192 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -2,6 +2,7 @@
 
 import logging
 import re
+from collections.abc import Iterator
 from enum import Enum
 from typing import Annotated, ClassVar, Literal, Optional, Union
 
@@ -17,13 +18,15 @@
 _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
 
 
-class _WebVTTLineTerminator(str, Enum):
+class WebVTTLineTerminator(str, Enum):
+    """WebVTT line terminator."""
+
     CRLF = "\r\n"
     LF = "\n"
     CR = "\r"
 
 
-_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
+WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
 class WebVTTTimestamp(BaseModel):
@@ -137,7 +140,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
     component: "WebVTTCueComponent"
-    terminator: Optional[_WebVTTLineTerminator] = None
+    terminator: Optional[WebVTTLineTerminator] = None
 
     @override
     def __str__(self):
@@ -148,7 +151,7 @@ def __str__(self):
 class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
-    terminator: Optional[_WebVTTLineTerminator] = None
+    terminator: Optional[WebVTTLineTerminator] = None
     components: Annotated[
         list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
@@ -339,7 +342,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
+    identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
@@ -363,6 +366,19 @@ def validate_payload(cls, payload):
                 raise ValueError("Cue payload must not contain '-->'")
         return payload
 
+    @staticmethod
+    def _create_text_components(text: str) -> Iterator[WebVTTCueComponentWithTerminator]:
+        """Yield a text span for each non-empty line of `text`, keeping LF terminators."""
+        lines = text.split("\n")  # split once and reuse for both length and iteration
+        for idx, line in enumerate(lines):
+            if not line:
+                continue  # empty segments (e.g. after a trailing "\n") yield nothing
+            yield WebVTTCueComponentWithTerminator(
+                component=WebVTTCueTextSpan(text=line),
+                # a non-empty segment is followed by "\n" unless it is the last one
+                terminator=WebVTTLineTerminator.LF if idx < len(lines) - 1 else None,
+            )
+
     @classmethod
     def parse(cls, raw: str) -> "WebVTTCueBlock":
         """Parse a WebVTT cue block from a string.
@@ -376,7 +392,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
-        identifier: Optional[_WebVTTCueIdentifier] = None
+        identifier: Optional[WebVTTCueIdentifier] = None
         timing_line = lines[0]
         if "-->" not in timing_line and len(lines) > 1:
             identifier = timing_line
@@ -391,7 +407,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
         timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
-        cue_text = " ".join(cue_lines).strip()
+        cue_text = "\n".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
             if cue_text.startswith(f"<{omm}") and f"</{omm}>" not in cue_text:
@@ -407,9 +423,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         while i < len(matches):
             match = matches[i]
             if match.start() > pos:
-                stack[-1].append(
-                    WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-                )
+                text = cue_text[pos : match.start()]
+                stack[-1].extend(cls._create_text_components(text))
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
             if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}:
@@ -454,7 +469,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:])))
+            text = cue_text[pos:]
+            stack[-1].extend(cls._create_text_components(text))
 
         return cls(
             identifier=identifier,
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 00000000..91be3530
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,13 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:01.000 --> 00:04.000
+Never drink liquid nitrogen.
+
+NOTE I’m not sure the timing is right on the following cue.
+
+00:05.000 --> 00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 9e47f1a8..1bf9edb8 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -274,6 +274,19 @@ def test_webvtt_file() -> None:
     assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
 
+    with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 2
+    block = vtt.cue_blocks[1]
+    assert len(block.payload) == 5
+    assert str(block) == (
+        "00:05.000 --> 00:09.000\n"
+        "— It will perforate your stomach.\n"
+        "— You could <b.loud>die</b>.\n"
+        "<v John>This is true.</v>\n"
+    )
+
 
 def test_webvtt_cue_language_span_start_tag():
     WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')

From 5721f099345ca5e014074140888227e6573a3609 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Sun, 14 Dec 2025 23:52:36 +0100
Subject: [PATCH 10/20] refactor(webvtt): set ProvenanceTrack time fields as
 float

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 20 ++++++++++-----
 docling_core/types/doc/webvtt.py   |  2 +-
 docs/DoclingDocument.json          | 41 ++++++++++--------------------
 test/test_doc_base.py              | 23 ++++++++++-------
 4 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 6f02c54f..7f088389 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -67,7 +67,6 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
-from docling_core.types.doc.webvtt import WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1178,17 +1177,17 @@ class ProvenanceTrack(BaseModel):
     """
 
     start_time: Annotated[
-        WebVTTTimestamp,
+        float,
         Field(
-            examples=["00.11.000", "00:00:06.500", "01:28:34.300"],
-            description="Start time offset of the track cue",
+            examples=[11.0, 6.5, 5370.0],
+            description="Start time offset of the track cue in seconds",
         ),
     ]
     end_time: Annotated[
-        WebVTTTimestamp,
+        float,
         Field(
-            examples=["00.12.000", "00:00:08.200", "01:29:30.100"],
-            description="End time offset of the track cue",
+            examples=[12.0, 8.2, 5370.1],
+            description="End time offset of the track cue in seconds",
         ),
     ]
     identifier: Optional[str] = Field(
@@ -1213,6 +1212,13 @@ class ProvenanceTrack(BaseModel):
         description="Classes for describing the cue significance",
     )
 
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        """Validate that `end_time` is strictly greater than `start_time`."""
+        if self.end_time <= self.start_time:  # equal times are rejected as well
+            raise ValueError("End time must be greater than start time")
+        return self
+
 
 def get_provenance_discriminator_value(v: Any) -> str:
     """Callable discriminator for provenance instances.
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 023b0192..30fa1a4f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -95,7 +95,7 @@ class WebVTTCueTimings(BaseModel):
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
-        """Ensure start timestamp is less than or equal to end timestamp."""
+        """Ensure start timestamp is less than end timestamp."""
         if self.start and self.end:
             if self.end.seconds <= self.start.seconds:
                 raise ValueError("End timestamp must be greater than start timestamp")
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 35175601..45a5d889 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2228,22 +2228,24 @@
       "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
       "properties": {
         "start_time": {
-          "$ref": "#/$defs/WebVTTTimestamp",
-          "description": "Start time offset of the track cue",
+          "description": "Start time offset of the track cue in seconds",
           "examples": [
-            "00.11.000",
-            "00:00:06.500",
-            "01:28:34.300"
-          ]
+            11.0,
+            6.5,
+            5370.0
+          ],
+          "title": "Start Time",
+          "type": "number"
         },
         "end_time": {
-          "$ref": "#/$defs/WebVTTTimestamp",
-          "description": "End time offset of the track cue",
+          "description": "End time offset of the track cue in seconds",
           "examples": [
-            "00.12.000",
-            "00:00:08.200",
-            "01:29:30.100"
-          ]
+            12.0,
+            8.2,
+            5370.1
+          ],
+          "title": "End Time",
+          "type": "number"
         },
         "identifier": {
           "anyOf": [
@@ -3190,21 +3192,6 @@
       ],
       "title": "TitleItem",
       "type": "object"
-    },
-    "WebVTTTimestamp": {
-      "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
-      "properties": {
-        "raw": {
-          "description": "A representation of the WebVTT Timestamp as a single string",
-          "title": "Raw",
-          "type": "string"
-        }
-      },
-      "required": [
-        "raw"
-      ],
-      "title": "WebVTTTimestamp",
-      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 18d2cf11..2d1ce498 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,8 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc.document import ProvenanceTrack
-from docling_core.types.doc.webvtt import WebVTTTimestamp
+from docling_core.types.doc import ProvenanceTrack
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -45,8 +44,8 @@ def test_prov_track():
     """Test the class ProvenanceTrack."""
 
     valid_track = ProvenanceTrack(
-        start_time=WebVTTTimestamp(raw="00:11.000"),
-        end_time=WebVTTTimestamp(raw="00:12.000"),
+        start_time=11.0,
+        end_time=12.0,
         identifier="test",
         voice="Mary",
         languages=["en", "en-GB"],
@@ -54,19 +53,25 @@ def test_prov_track():
     )
 
     assert valid_track
-    assert valid_track.start_time == WebVTTTimestamp(raw="00:11.000")
-    assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000")
+    assert valid_track.start_time == 11.0
+    assert valid_track.end_time == 12.0
     assert valid_track.identifier == "test"
     assert valid_track.voice == "Mary"
     assert valid_track.languages == ["en", "en-GB"]
     assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
-        ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000"))
+        ProvenanceTrack(start_time=11.0)
 
     with pytest.raises(ValidationError, match="should be a valid list"):
         ProvenanceTrack(
-            start_time=WebVTTTimestamp(raw="00:11.000"),
-            end_time=WebVTTTimestamp(raw="00:12.000"),
+            start_time=11.0,
+            end_time=12.0,
             languages="en",
         )
+
+    with pytest.raises(ValidationError, match="must be greater than start"):
+        ProvenanceTrack(
+            start_time=11.0,
+            end_time=11.0,
+        )

From 134cf959d3b742096ca26ba679464242b0378435 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 15 Dec 2025 22:13:16 +0100
Subject: [PATCH 11/20] chore(webvtt): ensure start time offsets are in
 sequence

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 34 +++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 30fa1a4f..bf5b7227 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -4,6 +4,7 @@
 import re
 from collections.abc import Iterator
 from enum import Enum
+from functools import total_ordering
 from typing import Annotated, ClassVar, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
@@ -29,6 +30,7 @@ class WebVTTLineTerminator(str, Enum):
 WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
+@total_ordering
 class WebVTTTimestamp(BaseModel):
     """WebVTT timestamp.
 
@@ -81,6 +83,18 @@ def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
         return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
+    def __eq__(self, other: object) -> bool:
+        """Return True if `other` is a timestamp with the same value of `seconds`."""
+        if not isinstance(other, WebVTTTimestamp):
+            return NotImplemented  # let Python try the reflected comparison
+        return self.seconds == other.seconds
+
+    def __lt__(self, other: object) -> bool:
+        """Return True if the `seconds` of this timestamp are less than those of `other`."""
+        if not isinstance(other, WebVTTTimestamp):
+            return NotImplemented  # let Python try the reflected comparison
+        return self.seconds < other.seconds
+
     @override
     def __str__(self) -> str:
         """Return a string representation of a WebVTT timestamp."""
@@ -97,7 +111,7 @@ class WebVTTCueTimings(BaseModel):
     def check_order(self) -> Self:
         """Ensure start timestamp is less than end timestamp."""
         if self.start and self.end:
-            if self.end.seconds <= self.start.seconds:
+            if self.end <= self.start:
                 raise ValueError("End timestamp must be greater than start timestamp")
         return self
 
@@ -512,6 +526,24 @@ def verify_signature(content: str) -> bool:
         else:
             return False
 
+    @model_validator(mode="after")
+    def validate_start_time(self) -> Self:
+        """Check that cue start times never decrease.
+
+        Each cue block is compared with its immediate predecessor, which ensures
+        that no cue starts before any of the cues that precede it in the file.
+        """
+        blocks = self.cue_blocks
+        # walk over consecutive (previous, current) pairs of cue blocks
+        for idx, (previous, current) in enumerate(zip(blocks, blocks[1:])):
+            if current.timings.start < previous.timings.start:
+                raise ValueError(
+                    f"The start time offset of block {idx + 1} must be greater than or"
+                    " equal to the start time offsets of all previous cues in the file"
+                )
+
+        return self
+
     @classmethod
     def parse(cls, raw: str) -> "WebVTTFile":
         """Parse a WebVTT file.

From 3983b4456b1c72f7bc59aacf67c96393363e040f Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 16 Dec 2025 17:04:20 +0100
Subject: [PATCH 12/20] chore(webvtt): improve regex to remove
 note,region,style blocks

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 11 ++++-------
 test/data/webvtt/webvtt_example_04.vtt | 20 ++++++++++++++++++++
 test/test_webvtt.py                    |  6 +++++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index bf5b7227..c4f7336f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -1,7 +1,7 @@
 """Models for the Docling's adoption of Web Video Text Tracks format."""
 
-import logging
 import re
+import warnings
 from collections.abc import Iterator
 from enum import Enum
 from functools import total_ordering
@@ -11,9 +11,6 @@
 from pydantic.types import StringConstraints
 from typing_extensions import Self, override
 
-_log = logging.getLogger(__name__)
-
-
 _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
 _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
 _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
@@ -512,6 +509,7 @@ def __str__(self):
 class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
+    _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
     cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
@@ -566,8 +564,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
         body = lines[1] if len(lines) > 1 else ""
 
         # Remove NOTE/STYLE/REGION blocks
-        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
-        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
+        body = re.sub(cls._pattern, "", body)
 
         # Split into cue blocks
         raw_blocks = re.split(r"\n\s*\n", body.strip())
@@ -576,7 +573,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
             try:
                 cues.append(WebVTTCueBlock.parse(block))
             except ValueError as e:
-                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
+                warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning)
 
         return cls(cue_blocks=cues)
 
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
index 91be3530..b0519be2 100644
--- a/test/data/webvtt/webvtt_example_04.vtt
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -2,6 +2,26 @@ WEBVTT
 
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 
+STYLE
+::cue {
+    background-image: linear-gradient(to bottom, dimgray, lightgray);
+    color: papayawhip;
+}
+/* Style blocks cannot use blank lines nor "dash dash greater than" */
+
+REGION
+id:editor-comments
+width: 40%
+regionanchor:0%,100%
+viewportanchor:10%,90%
+
+REGION
+id:scroll
+width: 40%
+regionanchor:100%,100%
+viewportanchor:90%,90%
+scroll:up
+
 00:01.000 --> 00:04.000
 Never drink liquid nitrogen.
 
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 1bf9edb8..51f448ed 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -5,6 +5,8 @@
 Copyright © 2019 World Wide Web Consortium.
 """
 
+import warnings
+
 import pytest
 from pydantic import ValidationError
 
@@ -276,7 +278,9 @@ def test_webvtt_file() -> None:
 
     with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = WebVTTFile.parse(content)
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            vtt = WebVTTFile.parse(content)
     assert len(vtt) == 2
     block = vtt.cue_blocks[1]
     assert len(block.payload) == 5

From ff30e427a9235ef669a43a14b83eea79f6a9506d Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 16 Dec 2025 17:23:34 +0100
Subject: [PATCH 13/20] chore(webvtt): parse the WebVTT file title

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 4 +++-
 test/data/webvtt/webvtt_example_04.vtt | 2 +-
 test/test_webvtt.py                    | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index c4f7336f..6b4eba1f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -510,6 +510,7 @@ class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
+    title: Optional[str] = None
     cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
@@ -561,6 +562,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
 
         # Strip "WEBVTT" header line
         lines = raw.split("\n", 1)
+        title = lines[0].removeprefix("WEBVTT").strip() or None
         body = lines[1] if len(lines) > 1 else ""
 
         # Remove NOTE/STYLE/REGION blocks
@@ -575,7 +577,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
             except ValueError as e:
                 warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning)
 
-        return cls(cue_blocks=cues)
+        return cls(title=title, cue_blocks=cues)
 
     def __iter__(self):
         """Return an iterator over the cue blocks."""
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
index b0519be2..78b5ba0c 100644
--- a/test/data/webvtt/webvtt_example_04.vtt
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -1,4 +1,4 @@
-WEBVTT
+WEBVTT Danger of Nitrogen
 
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 51f448ed..a3443fd2 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -275,6 +275,7 @@ def test_webvtt_file() -> None:
     assert len(block.payload) == 1
     assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
+    assert not vtt.title
 
     with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -290,6 +291,7 @@ def test_webvtt_file() -> None:
         "— You could <b.loud>die</b>.\n"
         "<v John>This is true.</v>\n"
     )
+    assert vtt.title == "Danger of Nitrogen"
 
 
 def test_webvtt_cue_language_span_start_tag():

From 6da51be58118aaf805579d07f61a80d41f381c45 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 6 Jan 2026 15:47:34 +0100
Subject: [PATCH 14/20] chore(webvtt): rebase to latest changes in idoctags

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/experimental/idoctags.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
index dd19d7f0..7376062b 100644
--- a/docling_core/experimental/idoctags.py
+++ b/docling_core/experimental/idoctags.py
@@ -175,6 +175,8 @@ def _create_location_tokens_for_item(
         return ""
     out: list[str] = []
     for prov in item.prov:
+        if not isinstance(prov, ProvenanceItem):
+            continue
         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
         bbox = prov.bbox.to_top_left_origin(page_h).as_tuple()
         out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres))
@@ -1379,12 +1381,14 @@ def serialize(
             # we will need to do something more complex I believe ...
             res: list[SerializationResult] = []
             for idp, prov_ in enumerate(item.prov):
-                item_ = copy.deepcopy(item)
+                if not isinstance(prov_, ProvenanceItem):
+                    continue
+                item_: TextItem = copy.deepcopy(item)
                 item_.prov = [prov_]
                 item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]]  # it must be `orig`, not `text` here!
                 item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]]
-
-                item_.prov[0].charspan = (0, len(item_.orig))
+                if isinstance(item_.prov[0], ProvenanceItem):
+                    item_.prov[0].charspan = (0, len(item_.orig))
 
                 # marker field should be cleared on subsequent split parts
                 if idp > 0 and isinstance(item_, ListItem):
@@ -1748,7 +1752,7 @@ def _emit_otsl(
 
         if params.add_table_cell_location:
             # Check if we have all required information for location serialization
-            if item.prov and len(item.prov) > 0:
+            if item.prov and isinstance(item.prov[0], ProvenanceItem):
                 page_no = item.prov[0].page_no
                 if doc.pages and page_no in doc.pages:
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
@@ -1897,6 +1901,8 @@ def serialize(
             for it, _ in doc.iterate_items(root=item):
                 if isinstance(it, DocItem) and it.prov:
                     for prov in it.prov:
+                        if not isinstance(prov, ProvenanceItem):
+                            continue
                         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
                         boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple())
                         prov_page_w_h = (page_w, page_h, prov.page_no)

From 0a9e190ccee54d3f55c90c2889f0980f6a5ea2ab Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 19 Jan 2026 18:54:07 +0100
Subject: [PATCH 15/20] feat(webvtt): add WebVTT serializer

Add a DoclingDocument serializer to WebVTT format.
Improve WebVTT data model.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/common.py |   3 +-
 docling_core/transforms/serializer/webvtt.py | 545 +++++++++++++++++++
 docling_core/types/doc/document.py           |   6 +-
 docling_core/types/doc/webvtt.py             | 150 ++++-
 docs/DoclingDocument.json                    |   6 +-
 test/data/doc/webvtt_example_01.gt.vtt       |  40 ++
 test/data/doc/webvtt_example_01.json         | 313 +++++++++++
 test/data/doc/webvtt_example_02.gt.vtt       |  16 +
 test/data/doc/webvtt_example_02.json         | 272 +++++++++
 test/data/doc/webvtt_example_03.gt.vtt       |  57 ++
 test/data/doc/webvtt_example_03.json         | 406 ++++++++++++++
 test/data/doc/webvtt_example_04.gt.vtt       |   9 +
 test/data/doc/webvtt_example_04.json         | 194 +++++++
 test/data/doc/webvtt_example_05.gt.vtt       |  10 +
 test/data/doc/webvtt_example_05.json         | 344 ++++++++++++
 test/test_serialization.py                   |  25 +
 test/test_webvtt.py                          |   7 +-
 17 files changed, 2372 insertions(+), 31 deletions(-)
 create mode 100644 docling_core/transforms/serializer/webvtt.py
 create mode 100644 test/data/doc/webvtt_example_01.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_01.json
 create mode 100644 test/data/doc/webvtt_example_02.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_02.json
 create mode 100644 test/data/doc/webvtt_example_03.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_03.json
 create mode 100644 test/data/doc/webvtt_example_04.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_04.json
 create mode 100644 test/data/doc/webvtt_example_05.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_05.json

diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index c9c497f4..c36062e0 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -324,7 +324,7 @@ def serialize_doc(
         parts: list[SerializationResult],
         **kwargs: Any,
     ) -> SerializationResult:
-        """Serialize a document out of its pages."""
+        """Serialize a document out of its parts."""
         ...
 
     def _serialize_body(self, **kwargs) -> SerializationResult:
@@ -355,7 +355,6 @@ def serialize(
         empty_res = create_ser_result()
 
         my_item = item or self.doc.body
-
         if my_item == self.doc.body:
             if my_item.meta and not self._meta_is_wrapped():
                 meta_part = self.serialize_meta(item=my_item, **my_kwargs)
diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
new file mode 100644
index 00000000..15fdbc3b
--- /dev/null
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -0,0 +1,545 @@
+"""Define classes for WebVTT serialization."""
+
+import logging
+import re
+from typing import Any, Optional, get_args
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BaseMetaSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.transforms.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    create_ser_result,
+)
+from docling_core.types.doc.document import (
+    ContentLayer,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    Formatting,
+    FormItem,
+    InlineGroup,
+    KeyValueItem,
+    ListGroup,
+    NodeItem,
+    PictureItem,
+    ProvenanceTrack,
+    TableItem,
+    TextItem,
+    TitleItem,
+)
+from docling_core.types.doc.webvtt import (
+    START_TAG_NAMES,
+    WebVTTCueBlock,
+    WebVTTCueSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTimings,
+    WebVTTFile,
+    WebVTTLineTerminator,
+    WebVTTTimestamp,
+)
+
+_logger = logging.getLogger(__name__)
+
+
+def _remove_consecutive_pairs(text: str) -> str:
+    """Remove one pass of consecutive start/end tag pairs.
+
+    This function looks for patterns like </tag><tag> where the tags are identical
+    and removes them. It handles two cases:
+    1. Direct adjacent tags with content: <tag>content</tag>whitespace<tag>
+    2. Tags with other tags in between: </tag><othertag><tag>
+
+    Args:
+        text: Input string
+
+    Returns:
+        String with one pass of consecutive pairs removed
+    """
+    # Pattern 1: Direct adjacent tags </tag><tag> with same classes and annotations
+    pattern1 = re.compile(
+        r"<([bciuv]|lang)((?:\.\w+)*)(?:\s+([^>]+))?>"  # Opening tag: capture tag, classes, annotation
+        r"((?:(?!</\1>).)*?)"  # Content (non-greedy, not containing the closing tag)
+        r"</\1>"  # Closing tag
+        r"(\s*)"  # Capture whitespace between tags (including newlines)
+        r"<\1((?:\.\w+)*)(?:\s+([^>]+))?>"  # Next opening tag: capture classes and annotation
+    )
+
+    def replacer1(match: re.Match[str]) -> str:
+        tag = match.group(1)
+        classes1 = match.group(2) or ""
+        anno1 = match.group(3) or ""
+        content = match.group(4)
+        whitespace = match.group(5)  # Whitespace between tags
+        classes2 = match.group(6) or ""
+        anno2 = match.group(7) or ""
+
+        # Only merge if classes and annotations match
+        if classes1 == classes2 and anno1 == anno2:
+            # Merge: remove the closing and opening tags, but keep the whitespace
+            return f"<{tag}{classes1}{' ' + anno1 if anno1 else ''}>{content}{whitespace}"
+        else:
+            # Don't merge - return original
+            return match.group(0)
+
+    # Pattern 2: Tags with other tags in between </tag><othertag><tag>
+    # This removes redundant </tag> and <tag> when there's another tag in between
+    pattern2 = re.compile(
+        r"</([bciuv]|lang)>"  # Closing tag
+        r"(<[^>]+>)"  # Any other tag in between
+        r"<\1(?:\.\w+)*(?:\s+[^>]+)?>"  # Same opening tag (with any classes/annotations)
+    )
+
+    def replacer2(match: re.Match[str]) -> str:
+        # Just keep the middle tag, remove the closing and opening of the same type
+        return match.group(2)
+
+    result = pattern1.sub(replacer1, text)
+    result = pattern2.sub(replacer2, result)
+
+    return result
+
+
+class WebVTTParams(CommonParams):
+    """Serialization parameters for the Web Video Text Tracks (WebVTT) format."""
+
+    layers: set[ContentLayer] = {ContentLayer.BODY}
+
+
+class WebVTTTextSerializer(BaseModel, BaseTextSerializer):
+    """Text serializer to Web Video Text Tracks (WebVTT) format."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        is_inline_scope: bool = False,
+        visited: Optional[set[str]] = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        # Handle TitleItem specially - it doesn't have provenance but we need its text
+        if isinstance(item, TitleItem):
+            return create_ser_result(text=item.text, span_source=item)
+
+        # Only process items with ProvenanceTrack (WebVTT cues)
+        if not item.text or not item.prov or not isinstance(item.prov[0], ProvenanceTrack):
+            return create_ser_result()
+
+        # Apply post-processing here: formatting, classes, language, and voice
+        # If the TextItem is part of an InlineGroup, we need to further post-process it
+        # within the group context
+
+        prov: ProvenanceTrack = item.prov[0]
+        text: str = doc_serializer.post_process(
+            text=item.text,
+            formatting=item.formatting,
+            voice=prov.voice,
+            languages=prov.languages,
+            classes=prov.classes,
+        )
+        if is_inline_scope:
+            # Iteratively remove unnecessary consecutive tag pairs until no more changes
+            prev_text: Optional[str] = None
+            while prev_text != text:
+                prev_text = text
+                text = _remove_consecutive_pairs(text)
+
+        return create_ser_result(text=text, span_source=item)
+
+
+class _WebVTTTableSerializer(BaseTableSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (item, doc_serializer, doc, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTPictureSerializer(BasePictureSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (item, doc_serializer, doc, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTKeyValueSerializer(BaseKeyValueSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (item, doc_serializer, doc, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTFormSerializer(BaseFormSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (item, doc_serializer, doc, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTFallbackSerializer(BaseFallbackSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (item, doc_serializer, doc, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTListSerializer(BaseModel, BaseListSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: ListGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        list_level: int = 0,
+        is_inline_scope: bool = False,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (doc, list_level, is_inline_scope, item, doc_serializer, kwargs)
+        return create_ser_result()
+
+
+class WebVTTInlineSerializer(BaseInlineSerializer):
+    """Inline group serializer to Web Video Text Tracks (WebVTT) format."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        list_level: int = 0,
+        visited: Optional[set[str]] = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serializes an inline group to WebVTT format."""
+        _ = doc
+        my_visited = visited if visited is not None else set()
+        parts = doc_serializer.get_parts(
+            item=item,
+            list_level=list_level,
+            is_inline_scope=True,
+            visited=my_visited,
+            **kwargs,
+        )
+        # Include all parts, even if text is empty or whitespace-only
+        # Use 'is not None' instead of truthiness check to preserve whitespace
+        text_res = "".join([p.text for p in parts if p.text is not None])
+
+        # Apply tag normalization to the concatenated result
+        # Iteratively remove consecutive pairs until no more changes
+        prev_text = None
+        while prev_text != text_res:
+            prev_text = text_res
+            text_res = _remove_consecutive_pairs(text_res)
+
+        return create_ser_result(text=text_res, span_source=parts)
+
+
+class _WebVTTMetaSerializer(BaseModel, BaseMetaSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (doc, item, kwargs)
+        return create_ser_result()
+
+
+class _WebVTTAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
+    """No-op for WebVTT output (not represented)."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        _ = (doc, item, kwargs)
+        return create_ser_result()
+
+
+class WebVTTDocSerializer(DocSerializer):
+    """Document serializer to Web Video Text Tracks (WebVTT) format."""
+
+    text_serializer: BaseTextSerializer = WebVTTTextSerializer()
+    table_serializer: BaseTableSerializer = _WebVTTTableSerializer()
+    picture_serializer: BasePictureSerializer = _WebVTTPictureSerializer()
+    key_value_serializer: BaseKeyValueSerializer = _WebVTTKeyValueSerializer()
+    form_serializer: BaseFormSerializer = _WebVTTFormSerializer()
+    fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer()
+    list_serializer: BaseListSerializer = _WebVTTListSerializer()
+    inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer()
+    meta_serializer: Optional[BaseMetaSerializer] = _WebVTTMetaSerializer()
+    annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer()
+
+    params: CommonParams = CommonParams()
+
+    @override
+    def requires_page_break(self) -> bool:
+        """Whether to add page breaks.
+
+        WebVTT format does not support page breaks.
+        """
+        return False
+
+    @override
+    def serialize_bold(self, text: str, **kwargs: Any) -> str:
+        """Apply WebVTT-specific bold serialization."""
+        classes: list[str] = kwargs.get("classes", {}).get("b", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="b",
+            css=classes,
+        )
+
+    @override
+    def serialize_italic(self, text: str, **kwargs: Any) -> str:
+        """Apply WebVTT-specific italic serialization."""
+        classes: list[str] = kwargs.get("classes", {}).get("i", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="i",
+            css=classes,
+        )
+
+    @override
+    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+        """Apply WebVTT-specific underline serialization."""
+        classes: list[str] = kwargs.get("classes", {}).get("u", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="u",
+            css=classes,
+        )
+
+    def serialize_cue_span(
+        self,
+        text: str,
+        tag: START_TAG_NAMES,
+        anno: Optional[str] = None,
+        css: Optional[list[str]] = None,
+    ) -> str:
+        """Wrap `text` in a WebVTT cue span; return it unchanged if the tag is unknown or lacks its annotation."""
+        classes: list[str] = css or []  # fresh list per call instead of a shared mutable default
+        start_tag: WebVTTCueSpanStartTag
+        if tag in {"b", "i", "u", "c"}:
+            start_tag = WebVTTCueSpanStartTag(name=tag, classes=classes)
+        elif tag in {"v", "lang"}:
+            if not anno:
+                # <v> and <lang> spans are treated as invalid without an annotation: emit the bare text
+                _logger.warning(f"Invalid {tag} cue span without annotation: {text}")
+                return text
+            start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=classes, annotation=anno)
+        else:
+            return text
+
+        return f"{start_tag}{text}</{tag}>"
+
+    @staticmethod
+    def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
+        """Extract tag and values from provenance classes.
+
+        Args:
+            classes: The classes from a ProvenanceTrack object.
+
+        Returns:
+            Map of tag to class values.
+        """
+        res: dict[str, list[str]] = {}
+        for item in classes or []:
+            for prefix in get_args(START_TAG_NAMES):
+                if item == prefix:
+                    res[prefix] = []
+                    break
+                elif item.startswith(prefix + "."):
+                    cls_str: str = item[len(prefix) + 1 :]
+                    res[prefix] = cls_str.split(".")
+                    break
+        return res
+
+    @override
+    def serialize_doc(
+        self,
+        *,
+        parts: list[SerializationResult],
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Build a WebVTT file from the serialized parts, merging parts that share cue identifier and timings."""
+        title: Optional[str] = None
+
+        timings: Optional[WebVTTCueTimings] = None
+        cue_id: Optional[str] = None
+        text: str = ""
+        cue_blocks: list[WebVTTCueBlock] = []
+        for part in parts:
+            if not part.text or not part.spans:
+                continue
+
+            # Get the doc item from the first span
+            doc_item: DocItem = part.spans[0].item
+
+            # Handle title items (check both TitleItem type and label)
+            if isinstance(doc_item, TitleItem) or (
+                isinstance(doc_item, TextItem) and doc_item.label == DocItemLabel.TITLE
+            ):
+                title = part.text
+                continue
+            if isinstance(doc_item, InlineGroup) and doc_item.children:
+                doc_item = doc_item.children[0].resolve(doc=self.doc)
+            if isinstance(doc_item, TextItem) and doc_item.prov and isinstance(doc_item.prov[0], ProvenanceTrack):
+                prov: ProvenanceTrack = doc_item.prov[0]
+                if (
+                    prov.identifier == cue_id
+                    and timings
+                    and timings.start == WebVTTTimestamp.from_seconds(prov.start_time)
+                    and timings.end == WebVTTTimestamp.from_seconds(prov.end_time)
+                ):
+                    # Same cue as the previous part (timings compared as timestamps, not raw floats): add a new line
+                    combined = text.rstrip() + WebVTTLineTerminator.LF.value + part.text
+                    # Use _remove_consecutive_pairs to merge tags like </v>\n<v Speaker A>
+                    # Iteratively remove consecutive pairs until no more changes
+                    prev_combined = None
+                    while prev_combined != combined:
+                        prev_combined = combined
+                        combined = _remove_consecutive_pairs(combined)
+                    text = combined + WebVTTLineTerminator.LF.value
+                else:
+                    if text:
+                        cue_blocks.append(WebVTTCueBlock.parse(text))
+                    timings = WebVTTCueTimings(
+                        start=WebVTTTimestamp.from_seconds(prov.start_time),
+                        end=WebVTTTimestamp.from_seconds(prov.end_time),
+                    )
+                    cue_id = prov.identifier
+                    text = (
+                        f"{cue_id + WebVTTLineTerminator.LF.value if cue_id else ''}{timings}"
+                        f"{WebVTTLineTerminator.LF.value}{part.text}"
+                        f"{WebVTTLineTerminator.LF.value}"
+                    )
+        if text:
+            cue_blocks.append(WebVTTCueBlock.parse(text))
+
+        webvtt_file = WebVTTFile(title=title, cue_blocks=cue_blocks)
+        content = str(webvtt_file)
+        return create_ser_result(text=content, span_source=parts)
+
+    def post_process(
+        self,
+        text: str,
+        formatting: Optional[Formatting] = None,
+        voice: Optional[str] = None,
+        languages: Optional[list[str]] = None,
+        classes: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Apply some text post-processing steps by adding formatting tags.
+
+        The order of the formatting tags is determined by this function and `DocSerializer.post_process`,
+        from the innermost to the outermost:
+            1. language (<lang>)
+            2. underline (<u>)
+            3. italic (<i>)
+            4. bold (<b>)
+            5. class (<c>)
+            6. voice (<v>)
+        """
+        res: str = text
+        cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
+
+        for lang in languages or []:
+            res = self.serialize_cue_span(text=res, tag="lang", anno=lang, css=cls.get("lang", []))
+
+        res = super().post_process(text=res, formatting=formatting, classes=cls)
+
+        if "c" in cls:
+            res = self.serialize_cue_span(
+                text=res,
+                tag="c",
+                css=cls.get("c", []),
+            )
+        if voice:
+            res = self.serialize_cue_span(
+                text=res,
+                tag="v",
+                anno=voice,
+                css=cls.get("v", []),
+            )
+
+        return res
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 7f088389..dc0dbbf2 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1208,7 +1208,7 @@ class ProvenanceTrack(BaseModel):
     classes: Optional[list[str]] = Field(
         None,
         min_length=1,
-        examples=["first", "loud", "yellow"],
+        examples=["b.first", "v.loud", "c.yellow"],
         description="Classes for describing the cue significance",
     )
 
@@ -1220,7 +1220,7 @@ def check_order(self) -> Self:
         return self
 
 
-def get_provenance_discriminator_value(v: Any) -> str:
+def _get_provenance_discriminator_value(v: Any) -> str:
     """Callable discriminator for provenance instances.
 
     Args:
@@ -1237,7 +1237,7 @@ def get_provenance_discriminator_value(v: Any) -> str:
 
 ProvenanceType = Annotated[
     Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
-    Discriminator(get_provenance_discriminator_value),
+    Discriminator(_get_provenance_discriminator_value),
 ]
 
 
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6b4eba1f..6bc4a219 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -13,7 +13,7 @@
 
 _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
 _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
-_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
+START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
 
 
 class WebVTTLineTerminator(str, Enum):
@@ -80,6 +80,23 @@ def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
         return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
+    @classmethod
+    def from_seconds(cls, seconds: float) -> Self:
+        """Create a WebVTT timestamp from seconds.
+
+        Args:
+            seconds: The time in seconds (can include fractional seconds for milliseconds).
+
+        Returns:
+            A WebVTT timestamp instance.
+        """
+        # Round once on the total so that e.g. 1.9996 s carries into the next second instead of yielding 1000 ms.
+        total_millis: int = round(seconds * 1000)
+        hours, remainder = divmod(total_millis, 3_600_000)
+        minutes, remainder = divmod(remainder, 60_000)
+        secs, millis = divmod(remainder, 1000)
+        return cls(raw=f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}")
+
     def __eq__(self, other: object) -> bool:
         """Two timestamps are equal if their total number of seconds is equal."""
         if not isinstance(other, WebVTTTimestamp):
@@ -92,9 +109,27 @@ def __lt__(self, other: "WebVTTTimestamp") -> bool:
             return NotImplemented
         return self.seconds < other.seconds
 
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the timestamp as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0.
+
+        Returns:
+            Formatted timestamp string.
+        """
+        if omit_hours_if_zero and self._hours == 0:
+            return f"{self._minutes:02d}:{self._seconds:02d}.{self._millis:03d}"
+        return self.raw
+
     @override
     def __str__(self) -> str:
-        """Return a string representation of a WebVTT timestamp."""
+        """Return a string representation of a WebVTT timestamp.
+
+        Always returns the full timestamp format including hours (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation (MM:SS.mmm) when hours are zero.
+        """
         return self.raw
 
 
@@ -112,9 +147,27 @@ def check_order(self) -> Self:
                 raise ValueError("End timestamp must be greater than start timestamp")
         return self
 
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the cue timings as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours from each timestamp whose own hours are 0.
+
+        Returns:
+            Formatted cue timings string in the format "start --> end".
+        """
+        start_str = self.start.format(omit_hours_if_zero=omit_hours_if_zero)
+        end_str = self.end.format(omit_hours_if_zero=omit_hours_if_zero)
+        return f"{start_str} --> {end_str}"
+
     @override
-    def __str__(self):
-        """Return a string representation of the cue timings."""
+    def __str__(self) -> str:
+        """Return a string representation of the cue timings.
+
+        Always returns the full format including hours (HH:MM:SS.mmm --> HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        """
         return f"{self.start} --> {self.end}"
 
 
@@ -142,7 +195,7 @@ def is_valid_text(cls, value: str) -> str:
         return value
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue text span."""
         return self.text
 
@@ -154,7 +207,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     terminator: Optional[WebVTTLineTerminator] = None
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue component with terminator."""
         return f"{self.component}{self.terminator.value if self.terminator else ''}"
 
@@ -169,7 +222,7 @@ class WebVTTCueInternalText(BaseModel):
     ] = []
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue internal text."""
         cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
@@ -178,7 +231,7 @@ def __str__(self):
 class WebVTTCueSpanStartTag(BaseModel):
     """WebVTT cue span start tag."""
 
-    name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
+    name: Annotated[START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
         list[str],
         Field(description="List of classes representing the cue span's significance"),
@@ -200,7 +253,7 @@ def _get_name_with_classes(self) -> str:
         return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()}>"
 
@@ -228,7 +281,7 @@ def is_valid_annotation(cls, value: str) -> str:
         return value
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
@@ -270,7 +323,7 @@ def check_tag_names_match(self) -> Self:
         return self
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue component."""
         return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
 
@@ -391,7 +444,7 @@ def _create_text_components(
                 )
 
     @classmethod
-    def parse(cls, raw: str) -> "WebVTTCueBlock":
+    def parse(cls, raw: str) -> Self:
         """Parse a WebVTT cue block from a string.
 
         Args:
@@ -489,29 +542,50 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
             payload=stack[0],
         )
 
-    def __str__(self):
-        """Return a string representation of the WebVTT cue block."""
+    def format(self, omit_hours_if_zero: bool = False, omit_voice_end: bool = False) -> str:
+        """Format the WebVTT cue block as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+            omit_voice_end: If True and this cue block has a WebVTT cue voice span as
+                its only component, omit the voice end tag for brevity.
+
+        Returns:
+            Formatted cue block string.
+        """
         parts = []
         if self.identifier:
             parts.append(f"{self.identifier}\n")
-        timings_line = str(self.timings)
+        timings_line = self.timings.format(omit_hours_if_zero=omit_hours_if_zero)
         parts.append(timings_line + "\n")
         for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
-                # the end tag may be omitted for brevity
+            if omit_voice_end and idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
                 parts.append(str(span).removesuffix("</v>"))
             else:
                 parts.append(str(span))
 
         return "".join(parts) + "\n"
 
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT cue block.
+
+        Always returns the full format including hours in timestamps (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        Always returns the WebVTT cue voice spans with the voice end tag, even if this
+        cue block has a WebVTT cue voice span as a single component in the payload. Use
+        `format(omit_voice_end=True)` to get a shorter representation without the voice
+        end tag.
+        """
+        return self.format()
+
 
 class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
-    title: Optional[str] = None
     cue_blocks: list[WebVTTCueBlock]
+    title: Optional[str] = None
 
     @staticmethod
     def verify_signature(content: str) -> bool:
@@ -544,7 +618,7 @@ def validate_start_time(self) -> Self:
         return self
 
     @classmethod
-    def parse(cls, raw: str) -> "WebVTTFile":
+    def parse(cls, raw: str) -> Self:
         """Parse a WebVTT file.
 
         Args:
@@ -579,14 +653,46 @@ def parse(cls, raw: str) -> "WebVTTFile":
 
         return cls(title=title, cue_blocks=cues)
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[WebVTTCueBlock]:  # type: ignore[override]
         """Return an iterator over the cue blocks."""
         return iter(self.cue_blocks)
 
-    def __getitem__(self, idx):
+    def __getitem__(self, idx) -> WebVTTCueBlock:
         """Return the cue block at the given index."""
         return self.cue_blocks[idx]
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of cue blocks."""
         return len(self.cue_blocks)
+
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the WebVTT file as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+
+        Returns:
+            Formatted WebVTT file string.
+        """
+        parts: list[str] = []
+
+        if self.title:
+            parts.append(f"WEBVTT {self.title}\n")
+        else:
+            parts.append("WEBVTT\n")
+
+        for cue_block in self.cue_blocks:
+            parts.append("\n")
+            parts.append(cue_block.format(omit_hours_if_zero=omit_hours_if_zero))
+
+        # Remove the trailing newline from the last cue block
+        return "".join(parts).rstrip("\n")
+
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT file.
+
+        Always returns the full format including hours in timestamps (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        """
+        return self.format()
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 45a5d889..cea39ba5 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2324,9 +2324,9 @@
           "default": null,
           "description": "Classes for describing the cue significance",
           "examples": [
-            "first",
-            "loud",
-            "yellow"
+            "b.first",
+            "v.loud",
+            "c.yellow"
           ],
           "title": "Classes"
         }
diff --git a/test/data/doc/webvtt_example_01.gt.vtt b/test/data/doc/webvtt_example_01.gt.vtt
new file mode 100644
index 00000000..cad1c72a
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.gt.vtt
@@ -0,0 +1,40 @@
+WEBVTT
+
+00:00:11.000 --> 00:00:13.000
+<v Roger Bingham>We are in New York City</v>
+
+00:00:13.000 --> 00:00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street</v>
+
+00:00:16.000 --> 00:00:18.000
+<v Roger Bingham>from the American Museum of Natural History</v>
+
+00:00:18.000 --> 00:00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson</v>
+
+00:00:20.000 --> 00:00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium</v>
+
+00:00:22.000 --> 00:00:24.000
+<v Roger Bingham>at the AMNH.</v>
+
+00:00:24.000 --> 00:00:26.000
+<v Roger Bingham>Thank you for walking down here.</v>
+
+00:00:27.000 --> 00:00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.</v>
+
+00:00:30.000 --> 00:00:31.500
+<v Roger Bingham>When we e-mailed—</v>
+
+00:00:30.500 --> 00:00:32.500
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?</v>
+
+00:00:32.000 --> 00:00:35.500
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos</v>
+
+00:00:32.500 --> 00:00:33.500
+<v Neil deGrasse Tyson><i>Laughs</i></v>
+
+00:00:35.500 --> 00:00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
new file mode 100644
index 00000000..5a7c9d29
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.json
@@ -0,0 +1,313 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_01",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 16887312431371817791,
+    "filename": "webvtt_example_01.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 11.0,
+          "end_time": 13.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "We are in New York City",
+      "text": "We are in New York City"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 13.0,
+          "end_time": 16.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "We’re actually at the Lucern Hotel, just down the street",
+      "text": "We’re actually at the Lucern Hotel, just down the street"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 16.0,
+          "end_time": 18.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "from the American Museum of Natural History",
+      "text": "from the American Museum of Natural History"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 18.0,
+          "end_time": 20.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "And with me is Neil deGrasse Tyson",
+      "text": "And with me is Neil deGrasse Tyson"
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 20.0,
+          "end_time": 22.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "Astrophysicist, Director of the Hayden Planetarium",
+      "text": "Astrophysicist, Director of the Hayden Planetarium"
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 22.0,
+          "end_time": 24.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "at the AMNH.",
+      "text": "at the AMNH."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 24.0,
+          "end_time": 26.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "Thank you for walking down here.",
+      "text": "Thank you for walking down here."
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 27.0,
+          "end_time": 30.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "And I want to do a follow-up on the last conversation we did.",
+      "text": "And I want to do a follow-up on the last conversation we did."
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 30.0,
+          "end_time": 31.5,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "When we e-mailed—",
+      "text": "When we e-mailed—"
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 30.5,
+          "end_time": 32.5,
+          "voice": "Neil deGrasse Tyson"
+        }
+      ],
+      "orig": "Didn’t we talk about enough in that conversation?",
+      "text": "Didn’t we talk about enough in that conversation?"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 32.0,
+          "end_time": 35.5,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+      "text": "No! No no no no; 'cos 'cos obviously 'cos"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 32.5,
+          "end_time": 33.5,
+          "voice": "Neil deGrasse Tyson"
+        }
+      ],
+      "orig": "Laughs",
+      "text": "Laughs",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 35.5,
+          "end_time": 38.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "You know I’m so excited my glasses are falling off here.",
+      "text": "You know I’m so excited my glasses are falling off here."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt
new file mode 100644
index 00000000..8f9811e7
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.gt.vtt
@@ -0,0 +1,16 @@
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+<v.first.loud Esme>It’s a blue apple tree!</v>
+
+00:00:02.000 --> 00:00:04.000
+<v Mary>No way!</v>
+
+00:00:04.000 --> 00:00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:00:06.000 --> 00:00:08.000
+<v.loud Mary>That’s awesome!</v>
+
+00:00:08.000 --> 00:00:10.000
+Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
new file mode 100644
index 00000000..2966a2e0
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.json
@@ -0,0 +1,272 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_02",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 8584853280299071027,
+    "filename": "webvtt_example_02.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 0.0,
+          "end_time": 2.0,
+          "voice": "Esme",
+          "classes": [
+            "v.first.loud"
+          ]
+        }
+      ],
+      "orig": "It\u2019s a blue apple tree!",
+      "text": "It\u2019s a blue apple tree!"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 2.0,
+          "end_time": 4.0,
+          "voice": "Mary"
+        }
+      ],
+      "orig": "No way!",
+      "text": "No way!"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0,
+          "voice": "Esme"
+        }
+      ],
+      "orig": "Hee!",
+      "text": "Hee!"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": " ",
+      "text": " "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": "laughter",
+      "text": "laughter",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 6.0,
+          "end_time": 8.0,
+          "voice": "Mary",
+          "classes": [
+            "v.loud"
+          ]
+        }
+      ],
+      "orig": "That\u2019s awesome!",
+      "text": "That\u2019s awesome!"
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": "Sur les ",
+      "text": "Sur les "
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0,
+          "languages": [
+            "en"
+          ],
+          "classes": [
+            "i.foreignphrase"
+          ]
+        }
+      ],
+      "orig": "playground",
+      "text": "playground",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": ", ici \u00e0 Montpellier",
+      "text": ", ici \u00e0 Montpellier"
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.gt.vtt b/test/data/doc/webvtt_example_03.gt.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.gt.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
new file mode 100644
index 00000000..dddce0f2
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.json
@@ -0,0 +1,406 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_03",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11620880316586573676,
+    "filename": "webvtt_example_03.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      },
+      {
+        "$ref": "#/texts/13"
+      },
+      {
+        "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
+      },
+      {
+        "$ref": "#/texts/16"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "OK,",
+      "text": "OK,"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I think now we should be recording",
+      "text": "I think now we should be recording"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.571,
+          "end_time": 9.403,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "properly.",
+      "text": "properly."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 10.683,
+          "end_time": 11.563,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+        }
+      ],
+      "orig": "Good.",
+      "text": "Good."
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 13.363,
+          "end_time": 13.803,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "Yeah.",
+      "text": "Yeah."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 49.603,
+          "end_time": 53.363,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "I was also thinking.",
+      "text": "I was also thinking."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 54.963,
+          "end_time": 62.072,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Would be maybe good to create items,",
+      "text": "Would be maybe good to create items,"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "some metadata,",
+      "text": "some metadata,"
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "some options that can be specific.",
+      "text": "some options that can be specific."
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "Yeah,",
+      "text": "Yeah,"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I mean I think you went even more than",
+      "text": "I mean I think you went even more than"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.563,
+          "end_time": 72.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "But we preserved the atoms.",
+      "text": "But we preserved the atoms."
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "than me.",
+      "text": "than me."
+    },
+    {
+      "self_ref": "#/texts/13",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I just opened the format.",
+      "text": "I just opened the format."
+    },
+    {
+      "self_ref": "#/texts/14",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 110.222,
+          "end_time": 111.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "give it a try, yeah.",
+      "text": "give it a try, yeah."
+    },
+    {
+      "self_ref": "#/texts/15",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 112.043,
+          "end_time": 115.043,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Okay, talk to you later.",
+      "text": "Okay, talk to you later."
+    },
+    {
+      "self_ref": "#/texts/16",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 114.603,
+          "end_time": 115.283,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "See you.",
+      "text": "See you."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt
new file mode 100644
index 00000000..ce7fcf65
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.gt.vtt
@@ -0,0 +1,9 @@
+WEBVTT Danger of Nitrogen
+
+00:00:01.000 --> 00:00:04.000
+Never drink liquid nitrogen.
+
+00:00:05.000 --> 00:00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
new file mode 100644
index 00000000..f96765fc
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.json
@@ -0,0 +1,194 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11822397499369478441,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/6"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "title",
+      "prov": [],
+      "orig": "Danger of Nitrogen",
+      "text": "Danger of Nitrogen"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 1.0,
+          "end_time": 4.0
+        }
+      ],
+      "orig": "Never drink liquid nitrogen.",
+      "text": "Never drink liquid nitrogen."
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 It will perforate your stomach.",
+      "text": "\u2014 It will perforate your stomach."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 You could ",
+      "text": "\u2014 You could "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "die",
+      "text": "die",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": ".",
+      "text": "."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "voice": "John"
+        }
+      ],
+      "orig": "This is true.",
+      "text": "This is true."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt
new file mode 100644
index 00000000..fd7b788c
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.gt.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
new file mode 100644
index 00000000..616c94fc
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.json
@@ -0,0 +1,344 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 5389775195091554844,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        },
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14580.0,
+          "end_time": 14760.0,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "Last night the chef surprised us with a culinary adventure.",
+      "text": "Last night the chef surprised us with a culinary adventure."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The waiter offered a ",
+      "text": "The waiter offered a "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "steaming bowl of ",
+      "text": "steaming bowl of ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "es-ES"
+          ]
+        }
+      ],
+      "orig": "paella",
+      "text": "paella",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+      "text": " that instantly transported the diners to a sunny Mediterranean coast."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The dessert\u2019s ",
+      "text": "The dessert\u2019s "
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "unexpected",
+      "text": "unexpected",
+      "formatting": {
+        "bold": true,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " ",
+      "text": " ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "it"
+          ]
+        }
+      ],
+      "orig": "arcobaleno",
+      "text": "arcobaleno",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": true,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " of flavors",
+      "text": " of flavors",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " left everyone in awe.",
+      "text": " left everyone in awe."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 6fe3b386..fd68a347 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -15,6 +15,7 @@
     MarkdownParams,
     OrigListItemMarkerMode,
 )
+from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer
 from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
@@ -563,3 +564,27 @@ def test_html_inline_and_formatting():
     ser = HTMLDocSerializer(doc=doc)
     actual = ser.serialize().text
     verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+
+
+# ===============================
+# WebVTT tests
+# ===============================
+
+
+@pytest.mark.parametrize(
+    "file_name",
+    [
+        "webvtt_example_01",
+        "webvtt_example_02",
+        "webvtt_example_03",
+        "webvtt_example_04",
+        "webvtt_example_05",
+    ],
+)
+def test_webvtt(file_name):
+    src = Path(f"./test/data/doc/{file_name}.json")
+    doc = DoclingDocument.load_from_json(src)
+
+    ser = WebVTTDocSerializer(doc=doc)
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.vtt"), actual=actual)
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index a3443fd2..938da37c 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -255,7 +255,12 @@ def test_webvtt_file() -> None:
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
         "https://www.w3.org/TR/webvtt1/\n\n"
     )
-    reverse += "\n".join([str(block) for block in vtt.cue_blocks])
+    reverse += "\n".join(
+        [
+            block.format(omit_hours_if_zero=True, omit_voice_end=True)
+            for block in vtt.cue_blocks
+        ]
+    )
     assert content == reverse.rstrip()
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:

From 0b24861ee1d700c098aca894714c25ee125988df Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 19 Jan 2026 20:04:17 +0100
Subject: [PATCH 16/20] fix(webvtt): add 'text/vtt' as extra mimetype

Add 'text/vtt' as an extra MIME type to support WebVTT serialization, since the
standard-library 'mimetypes' module does not recognize it on Python < 3.11.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index dc0dbbf2..82e71751 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -960,6 +960,7 @@ class DocumentOrigin(BaseModel):
         "text/asciidoc",
         "text/markdown",
         "text/csv",
+        "text/vtt",
         "audio/x-wav",
         "audio/wav",
         "audio/mp3",

From 5e0a7870f842ce82b8ca09eed5264013e728ded3 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 22 Jan 2026 18:21:22 +0100
Subject: [PATCH 17/20] refactor(webvtt): roll back DocItem.prov as list of
 ProvenanceItem

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/experimental/idoctags.py         |  14 +-
 docling_core/transforms/serializer/azure.py   |  11 +-
 docling_core/transforms/serializer/common.py  |  15 +-
 docling_core/transforms/serializer/doctags.py |   6 +-
 docling_core/transforms/serializer/webvtt.py  |  14 +-
 .../visualizer/key_value_visualizer.py        |   5 +-
 .../visualizer/layout_visualizer.py           |   3 +-
 .../visualizer/reading_order_visualizer.py    |   3 +-
 .../transforms/visualizer/table_visualizer.py |  11 +-
 docling_core/types/doc/__init__.py            |   4 +-
 docling_core/types/doc/document.py            | 174 ++++---
 docling_core/types/doc/webvtt.py              |   2 +-
 docling_core/utils/legacy.py                  |   3 -
 docs/DoclingDocument.json                     | 467 +++++++++++-------
 test/data/doc/webvtt_example_01.json          |  39 +-
 test/data/doc/webvtt_example_02.json          |  27 +-
 test/data/doc/webvtt_example_03.json          |  51 +-
 test/data/doc/webvtt_example_04.json          |  18 +-
 test/data/doc/webvtt_example_05.json          |  33 +-
 test/test_deserializer_idoctags.py            |   4 +-
 test/test_doc_base.py                         |  12 +-
 test/test_serialization_doctag.py             |   3 +-
 test/test_serialization_idoctag.py            |  22 +-
 test/test_webvtt.py                           |   1 -
 24 files changed, 553 insertions(+), 389 deletions(-)

diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
index 7376062b..dd19d7f0 100644
--- a/docling_core/experimental/idoctags.py
+++ b/docling_core/experimental/idoctags.py
@@ -175,8 +175,6 @@ def _create_location_tokens_for_item(
         return ""
     out: list[str] = []
     for prov in item.prov:
-        if not isinstance(prov, ProvenanceItem):
-            continue
         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
         bbox = prov.bbox.to_top_left_origin(page_h).as_tuple()
         out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres))
@@ -1381,14 +1379,12 @@ def serialize(
             # we will need to do something more complex I believe ...
             res: list[SerializationResult] = []
             for idp, prov_ in enumerate(item.prov):
-                if not isinstance(prov_, ProvenanceItem):
-                    continue
-                item_: TextItem = copy.deepcopy(item)
+                item_ = copy.deepcopy(item)
                 item_.prov = [prov_]
                 item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]]  # it must be `orig`, not `text` here!
                 item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]]
-                if isinstance(item_.prov[0], ProvenanceItem):
-                    item_.prov[0].charspan = (0, len(item_.orig))
+
+                item_.prov[0].charspan = (0, len(item_.orig))
 
                 # marker field should be cleared on subsequent split parts
                 if idp > 0 and isinstance(item_, ListItem):
@@ -1752,7 +1748,7 @@ def _emit_otsl(
 
         if params.add_table_cell_location:
             # Check if we have all required information for location serialization
-            if item.prov and isinstance(item.prov[0], ProvenanceItem):
+            if item.prov and len(item.prov) > 0:
                 page_no = item.prov[0].page_no
                 if doc.pages and page_no in doc.pages:
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
@@ -1901,8 +1897,6 @@ def serialize(
             for it, _ in doc.iterate_items(root=item):
                 if isinstance(it, DocItem) and it.prov:
                     for prov in it.prov:
-                        if not isinstance(prov, ProvenanceItem):
-                            continue
                         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
                         boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple())
                         prov_page_w_h = (page_w, page_h, prov.page_no)
diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
index ed91aee2..1addf996 100644
--- a/docling_core/transforms/serializer/azure.py
+++ b/docling_core/transforms/serializer/azure.py
@@ -55,7 +55,6 @@
     ListGroup,
     NodeItem,
     PictureItem,
-    ProvenanceItem,
     RefItem,
     RichTableCell,
     TableItem,
@@ -77,7 +76,7 @@ def _bbox_to_polygon_coords(
 
 def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]:
     """Compute a TOPLEFT-origin polygon for the first provenance of the item."""
-    if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+    if not item.prov:
         return None
 
     prov = item.prov[0]
@@ -188,7 +187,7 @@ def serialize(
 
         # Lists may be represented either as TextItem(ListItem) or via groups;
         # we treat any TextItem as a paragraph-like entry.
-        if item.prov and isinstance(item.prov[0], ProvenanceItem):
+        if item.prov:
             prov = item.prov[0]
             page_no = prov.page_no
             polygon = _bbox_to_polygon_for_item(doc, item)
@@ -238,7 +237,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+        if not item.prov:
             return create_ser_result()
 
         prov = item.prov[0]
@@ -309,7 +308,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+        if not item.prov:
             return create_ser_result()
 
         prov = item.prov[0]
@@ -325,7 +324,7 @@ def serialize(
         for foot_ref in item.footnotes:
             if isinstance(foot_ref, RefItem):
                 tgt = foot_ref.resolve(doc)
-                if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem):
+                if isinstance(tgt, TextItem) and tgt.prov:
                     f_poly = _bbox_to_polygon_for_item(doc, tgt)
                     if f_poly is not None:
                         foots.append(
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index c36062e0..43bfd54b 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -52,7 +52,6 @@
     PictureDataType,
     PictureItem,
     PictureMoleculeData,
-    ProvenanceItem,
     Script,
     TableAnnotationType,
     TableItem,
@@ -109,7 +108,7 @@ def _iterate_items(
                     add_page_breaks=add_page_breaks,
                     visited=my_visited,
                 ):
-                    if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem):
+                    if isinstance(it, DocItem) and it.prov:
                         page_no = it.prov[0].page_no
                         if prev_page_nr is not None and page_no > prev_page_nr:
                             yield (
@@ -121,7 +120,7 @@ def _iterate_items(
                                 lvl,
                             )
                         break
-            elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem):
+            elif isinstance(item, DocItem) and item.prov:
                 page_no = item.prov[0].page_no
                 if prev_page_nr is None or page_no > prev_page_nr:
                     if prev_page_nr is not None:  # close previous range
@@ -302,13 +301,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
                             or item.content_layer not in params.layers
                             or (
                                 params.pages is not None
-                                and (
-                                    (not item.prov)
-                                    or (
-                                        isinstance(item.prov[0], ProvenanceItem)
-                                        and item.prov[0].page_no not in params.pages
-                                    )
-                                )
+                                and ((not item.prov) or item.prov[0].page_no not in params.pages)
                             )
                         )
                     )
@@ -355,6 +348,7 @@ def serialize(
         empty_res = create_ser_result()
 
         my_item = item or self.doc.body
+
         if my_item == self.doc.body:
             if my_item.meta and not self._meta_is_wrapped():
                 meta_part = self.serialize_meta(item=my_item, **my_kwargs)
@@ -677,7 +671,6 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
             if (
                 isinstance(item, DocItem)
                 and item.prov
-                and isinstance(item.prov[0], ProvenanceItem)
                 and (self.params.pages is None or item.prov[0].page_no in self.params.pages)
                 and ix >= self.params.start_idx
                 and ix < self.params.stop_idx
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index 16549652..dc8c520f 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -345,7 +345,7 @@ def serialize(
         results: list[SerializationResult] = []
 
         page_no = 1
-        if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
+        if len(item.prov) > 0:
             page_no = item.prov[0].page_no
 
         if params.add_location:
@@ -363,7 +363,7 @@ def serialize(
 
         for cell in item.graph.cells:
             cell_txt = ""
-            if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
+            if cell.prov is not None:
                 if len(doc.pages.keys()):
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
                     cell_txt += DocumentToken.get_location(
@@ -471,7 +471,7 @@ def _get_inline_location_tags(
         doc_items: list[DocItem] = []
         for it, _ in doc.iterate_items(root=item):
             if isinstance(it, DocItem):
-                for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
+                for prov in it.prov:
                     boxes.append(prov.bbox)
                     doc_items.append(it)
         if prov is None:
diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index 15fdbc3b..bfd1fd55 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -38,10 +38,10 @@
     ListGroup,
     NodeItem,
     PictureItem,
-    ProvenanceTrack,
     TableItem,
     TextItem,
     TitleItem,
+    TrackProvenance,
 )
 from docling_core.types.doc.webvtt import (
     START_TAG_NAMES,
@@ -140,15 +140,15 @@ def serialize(
         if isinstance(item, TitleItem):
             return create_ser_result(text=item.text, span_source=item)
 
-        # Only process items with ProvenanceTrack (WebVTT cues)
-        if not item.text or not item.prov or not isinstance(item.prov[0], ProvenanceTrack):
+        # Only process items with TrackProvenance (WebVTT cues)
+        if not item.text or not item.source or item.source[0].kind != "track":
             return create_ser_result()
 
         # Apply post-processing here: formatting, classes, language, and voice
         # If the TextItem is part of an InlineGroup, we need to further post-process it
         # within the group context
 
-        prov: ProvenanceTrack = item.prov[0]
+        prov: TrackProvenance = item.source[0]
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
@@ -417,7 +417,7 @@ def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
         """Extract tag and values from provenance classes.
 
         Args:
-            classes: The classes from a ProvenanceTrack object.
+            classes: The classes from a TrackProvenance object.
 
         Returns:
             Map of tag to class values.
@@ -463,8 +463,8 @@ def serialize_doc(
                 continue
             if isinstance(doc_item, InlineGroup) and doc_item.children:
                 doc_item = doc_item.children[0].resolve(doc=self.doc)
-            if isinstance(doc_item, TextItem) and doc_item.prov and isinstance(doc_item.prov[0], ProvenanceTrack):
-                prov: ProvenanceTrack = doc_item.prov[0]
+            if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track":
+                prov: TrackProvenance = doc_item.source[0]
                 if (
                     prov.identifier == id
                     and timings
diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py
index e2b10264..89b07f77 100644
--- a/docling_core/transforms/visualizer/key_value_visualizer.py
+++ b/docling_core/transforms/visualizer/key_value_visualizer.py
@@ -21,7 +21,6 @@
     DoclingDocument,
     GraphCellLabel,
     GraphLinkLabel,
-    ProvenanceItem,
 )
 
 # ---------------------------------------------------------------------------
@@ -87,7 +86,7 @@ def _draw_key_value_layer(
             # First draw cells (rectangles + optional labels)
             # ------------------------------------------------------------------
             for cell in cell_dict.values():
-                if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no:
+                if cell.prov is None or cell.prov.page_no != page_no:
                     continue  # skip cells not on this page or without bbox
 
                 tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height)
@@ -154,8 +153,6 @@ def _draw_key_value_layer(
                 if (
                     src_cell.prov is None
                     or tgt_cell.prov is None
-                    or not isinstance(src_cell.prov, ProvenanceItem)
-                    or not isinstance(tgt_cell.prov, ProvenanceItem)
                     or src_cell.prov.page_no != page_no
                     or tgt_cell.prov.page_no != page_no
                 ):
diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py
index 8ac6bf81..043fedac 100644
--- a/docling_core/transforms/visualizer/layout_visualizer.py
+++ b/docling_core/transforms/visualizer/layout_visualizer.py
@@ -17,7 +17,6 @@
     DocItem,
     DocItemLabel,
     DoclingDocument,
-    ProvenanceItem,
     TextCell,
 )
 
@@ -179,7 +178,7 @@ def _draw_doc_layout(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
+            for prov in elem.prov:
                 page_nr = prov.page_no
 
                 if page_nr in my_images:
diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py
index 27583613..60874333 100644
--- a/docling_core/transforms/visualizer/reading_order_visualizer.py
+++ b/docling_core/transforms/visualizer/reading_order_visualizer.py
@@ -14,7 +14,6 @@
     DocItem,
     DoclingDocument,
     PictureItem,
-    ProvenanceItem,
 )
 
 
@@ -131,7 +130,7 @@ def _draw_doc_reading_order(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
+            for prov in elem.prov:
                 page_no = prov.page_no
                 image = my_images.get(page_no)
 
diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py
index d3790d6b..5f601f9a 100644
--- a/docling_core/transforms/visualizer/table_visualizer.py
+++ b/docling_core/transforms/visualizer/table_visualizer.py
@@ -10,12 +10,7 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc import (
-    ContentLayer,
-    DoclingDocument,
-    ProvenanceItem,
-    TableItem,
-)
+from docling_core.types.doc import ContentLayer, DoclingDocument, TableItem
 
 _log = logging.getLogger(__name__)
 
@@ -190,10 +185,10 @@ def _draw_doc_tables(
                 image = pil_img.copy()
                 my_images[page_nr] = image
 
-        for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
+        for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
             if not isinstance(elem, TableItem):
                 continue
-            if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
+            if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
             if len(elem.prov) == 1:
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index d8ddd0b4..c3a2b237 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -46,6 +46,7 @@
     PictureClassificationClass,
     PictureClassificationData,
     PictureClassificationMetaField,
+    PictureClassificationPrediction,
     PictureDataType,
     PictureItem,
     PictureLineChartData,
@@ -56,7 +57,7 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
-    ProvenanceTrack,
+    ProvenanceType,
     RefItem,
     RichTableCell,
     Script,
@@ -69,6 +70,7 @@
     TabularChartMetaField,
     TextItem,
     TitleItem,
+    TrackProvenance,
     UnorderedList,
 )
 from .labels import (
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 82e71751..e864574f 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -32,12 +32,10 @@
     AnyUrl,
     BaseModel,
     ConfigDict,
-    Discriminator,
     Field,
     FieldSerializationInfo,
     SerializerFunctionWrapHandler,
     StringConstraints,
-    Tag,
     computed_field,
     field_serializer,
     field_validator,
@@ -1170,13 +1168,27 @@ class ProvenanceItem(BaseModel):
     charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
 
 
-class ProvenanceTrack(BaseModel):
-    """Provenance information for elements extracted from media assets.
+class BaseProvenance(BaseModel):
+    """Base class for provenance information.
 
-    A `ProvenanceTrack` instance describes a cue in a text track associated with a
-    media element (audio, video, subtitles, screen recordings, ...).
+    Represents the provenance of an extracted component within a digital asset.
     """
 
+    kind: Annotated[
+        str, Field(description="Kind of provenance. It is used as a discriminator for the provenance type.")
+    ]
+
+
+class TrackProvenance(BaseProvenance):
+    """Provenance metadata for a cue extracted from a media track.
+
+    A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,
+    etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle
+    block, an audio clip, or a timed marker in a screen-recording.
+    """
+
+    kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track"
+
     start_time: Annotated[
         float,
         Field(
@@ -1221,25 +1233,22 @@ def check_order(self) -> Self:
         return self
 
 
-def _get_provenance_discriminator_value(v: Any) -> str:
-    """Callable discriminator for provenance instances.
+ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")]
+"""Union type for all provenance types.
 
-    Args:
-        v: Either dict or model input.
-
-    Returns:
-        A string discriminator of provenance instances.
-    """
-    fields = {"bbox", "page_no", "charspan"}
-    if isinstance(v, dict):
-        return "item" if any(f in v for f in fields) else "track"
-    return "item" if any(hasattr(v, f) for f in fields) else "track"
+This type alias represents a discriminated union of all available provenance types that can be associated with
+extracted elements in a document. The `kind` field is used as a discriminator to determine the specific
+provenance type at runtime.
 
+Currently supported provenance types:
+    - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles)
 
-ProvenanceType = Annotated[
-    Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
-    Discriminator(_get_provenance_discriminator_value),
-]
+Notes:
+    - Additional provenance types may be added to this union in the future to support
+        other content sources.
+    - For documents with an implicit or explicit layout, such as PDF, HTML, docx, pptx, or markdown files, the
+        `ProvenanceItem` should still be used.
+"""
 
 
 class ContentLayer(str, Enum):
@@ -1544,20 +1553,28 @@ class FineRef(RefItem):
     range: Optional[tuple[int, int]] = None  # start_inclusive, end_exclusive
 
 
-class DocItem(NodeItem):  # Base type for any element that carries content, can be a leaf node
-    """DocItem."""
+class DocItem(NodeItem):
+    """Base type for any element that carries content, can be a leaf node."""
 
     label: DocItemLabel
-    prov: list[ProvenanceType] = []
+    prov: list[ProvenanceItem] = []
+    source: Annotated[
+        list[ProvenanceType],
+        Field(
+            description="The provenance of this document item. Currently, it is only used for media track provenance."
+        ),
+    ] = []
     comments: list[FineRef] = []  # References to comment items annotating this content
 
     @model_serializer(mode="wrap")
     def _custom_pydantic_serialize(self, handler: SerializerFunctionWrapHandler) -> dict:
         dumped = handler(self)
 
-        # suppress serializing comment list when empty:
-        if dumped.get("comments") == []:
-            del dumped["comments"]
+        # suppress serializing comment and source lists when empty:
+        for field in {"comments", "source"}:
+            if dumped.get(field) == []:
+                del dumped[field]
+
         return dumped
 
     def get_location_tokens(
@@ -1573,7 +1590,7 @@ def get_location_tokens(
             return ""
 
         location = ""
-        for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)):
+        for prov in self.prov:
             page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
 
             loc_str = DocumentToken.get_location(
@@ -1609,9 +1626,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if not page_image:
             return None
         crop_bbox = (
-            prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size(
-                old_size=page.size, new_size=page.image.size
-            )
+            self.prov[prov_index]
+            .bbox.to_top_left_origin(page_height=page.size.height)
+            .scale_to_size(old_size=page.size, new_size=page.image.size)
             # .scaled(scale=page_image.height / page.size.height)
         )
         return page_image.crop(crop_bbox.as_tuple())
@@ -2282,7 +2299,7 @@ def export_to_otsl(
             return ""
 
         page_no = 0
-        if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem):
+        if len(self.prov) > 0:
             page_no = self.prov[0].page_no
 
         for i in range(nrows):
@@ -2412,7 +2429,7 @@ class GraphCell(BaseModel):
     text: str  # sanitized text
     orig: str  # text as seen on document
 
-    prov: Optional[ProvenanceType] = None
+    prov: Optional[ProvenanceItem] = None
 
     # in case you have a text, table or picture item
     item_ref: Optional[RefItem] = None
@@ -3061,7 +3078,7 @@ def add_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3072,7 +3089,7 @@ def add_list_item(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3113,7 +3130,7 @@ def add_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3124,7 +3141,7 @@ def add_text(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3250,7 +3267,7 @@ def add_table(
         self,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
@@ -3260,7 +3277,7 @@ def add_table(
 
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
 
@@ -3296,7 +3313,7 @@ def add_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
     ):
@@ -3305,7 +3322,7 @@ def add_picture(
         :param data: Optional[list[PictureData]]: (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3337,7 +3354,7 @@ def add_title(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3348,7 +3365,7 @@ def add_title(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3383,7 +3400,7 @@ def add_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3396,7 +3413,7 @@ def add_code(
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3434,7 +3451,7 @@ def add_formula(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3445,7 +3462,7 @@ def add_formula(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3479,7 +3496,7 @@ def add_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3491,7 +3508,7 @@ def add_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3524,13 +3541,13 @@ def add_heading(
     def add_key_values(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_key_values.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3555,13 +3572,13 @@ def add_key_values(
     def add_form(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_form.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3750,7 +3767,7 @@ def insert_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3763,7 +3780,7 @@ def insert_list_item(
         :param enumerated: bool:  (Default value = False)
         :param marker: Optional[str]:  (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3822,7 +3839,7 @@ def insert_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3834,7 +3851,7 @@ def insert_text(
         :param label: DocItemLabel:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3934,7 +3951,7 @@ def insert_table(
         sibling: NodeItem,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
         annotations: Optional[list[TableAnnotationType]] = None,
@@ -3945,7 +3962,7 @@ def insert_table(
         :param sibling: NodeItem:
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param annotations: Optional[list[TableAnnotationType]]: (Default value = None)
@@ -3982,7 +3999,7 @@ def insert_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         after: bool = True,
     ) -> PictureItem:
@@ -3992,7 +4009,7 @@ def insert_picture(
         :param annotations: Optional[list[PictureDataType]]: (Default value = None)
         :param image: Optional[ImageRef]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
@@ -4026,7 +4043,7 @@ def insert_title(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4037,7 +4054,7 @@ def insert_title(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4077,7 +4094,7 @@ def insert_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4090,7 +4107,7 @@ def insert_code(
         :param code_language: Optional[str]: (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4132,7 +4149,7 @@ def insert_formula(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4143,7 +4160,7 @@ def insert_formula(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4182,7 +4199,7 @@ def insert_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4194,7 +4211,7 @@ def insert_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4232,14 +4249,14 @@ def insert_key_values(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         after: bool = True,
     ) -> KeyValueItem:
         """Creates a new KeyValueItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: KeyValueItem: The newly created KeyValueItem item.
@@ -4261,14 +4278,14 @@ def insert_form(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         after: bool = True,
     ) -> FormItem:
         """Creates a new FormItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: FormItem: The newly created FormItem item.
@@ -4605,10 +4622,7 @@ def _iterate_items_with_stack(
             (not isinstance(root, GroupItem) or with_groups)
             and (
                 not isinstance(root, DocItem)
-                or (
-                    page_nrs is None
-                    or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem))
-                )
+                or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov))
             )
             and root.content_layer in my_layers
         )
@@ -4730,7 +4744,7 @@ def _with_pictures_refs(
                             else:
                                 obj_path = loc_path
 
-                            if item.image is None and isinstance(item.prov[0], ProvenanceItem):
+                            if item.image is None:
                                 scale = img.size[0] / item.prov[0].bbox.width
                                 item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale))
                             elif item.image is not None:
@@ -6136,7 +6150,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
                     if isinstance(new_item, DocItem):
                         # update page numbers
                         # NOTE other prov sources (e.g. GraphCell) currently not covered
-                        for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)):
+                        for prov in new_item.prov:
                             prov.page_no += page_delta
 
                     if item.parent:
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6bc4a219..297e97fb 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -406,7 +406,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
+    identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
index 5ebac4be..26042436 100644
--- a/docling_core/utils/legacy.py
+++ b/docling_core/utils/legacy.py
@@ -165,7 +165,6 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
                         span=[0, len(item.text)],
                     )
                     for p in item.prov
-                    if isinstance(p, ProvenanceItem)
                 ]
                 main_text.append(
                     BaseText(
@@ -287,7 +286,6 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, 0],
                             )
                             for p in item.prov
-                            if isinstance(p, ProvenanceItem)
                         ],
                     )
                 )
@@ -315,7 +313,6 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, len(caption)],
                             )
                             for p in item.prov
-                            if isinstance(p, ProvenanceItem)
                         ],
                         obj_type=doc_item_label_to_legacy_type(item.label),
                         text=caption,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index cea39ba5..b37260eb 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -233,16 +233,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -658,16 +670,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -807,16 +831,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -894,21 +930,13 @@
         "prov": {
           "anyOf": [
             {
-              "oneOf": [
-                {
-                  "$ref": "#/$defs/ProvenanceItem"
-                },
-                {
-                  "$ref": "#/$defs/ProvenanceTrack"
-                }
-              ]
+              "$ref": "#/$defs/ProvenanceItem"
             },
             {
               "type": "null"
             }
           ],
-          "default": null,
-          "title": "Prov"
+          "default": null
         },
         "item_ref": {
           "anyOf": [
@@ -1227,16 +1255,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -1406,16 +1446,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -1789,16 +1841,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -2224,120 +2288,6 @@
       "title": "ProvenanceItem",
       "type": "object"
     },
-    "ProvenanceTrack": {
-      "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
-      "properties": {
-        "start_time": {
-          "description": "Start time offset of the track cue in seconds",
-          "examples": [
-            11.0,
-            6.5,
-            5370.0
-          ],
-          "title": "Start Time",
-          "type": "number"
-        },
-        "end_time": {
-          "description": "End time offset of the track cue in seconds",
-          "examples": [
-            12.0,
-            8.2,
-            5370.1
-          ],
-          "title": "End Time",
-          "type": "number"
-        },
-        "identifier": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "An identifier of the cue",
-          "examples": [
-            "test",
-            "123",
-            "b72d946"
-          ],
-          "title": "Identifier"
-        },
-        "voice": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "The cue voice (speaker)",
-          "examples": [
-            "Mary",
-            "Fred",
-            "Name Surname"
-          ],
-          "title": "Voice"
-        },
-        "languages": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "Languages of the cue in BCP 47 language tag format",
-          "examples": [
-            [
-              "en",
-              "en-GB"
-            ],
-            [
-              "fr-CA"
-            ]
-          ],
-          "title": "Languages"
-        },
-        "classes": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "minItems": 1,
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "Classes for describing the cue significance",
-          "examples": [
-            "b.first",
-            "v.loud",
-            "c.yellow"
-          ],
-          "title": "Classes"
-        }
-      },
-      "required": [
-        "start_time",
-        "end_time"
-      ],
-      "title": "ProvenanceTrack",
-      "type": "object"
-    },
     "RefItem": {
       "description": "RefItem.",
       "properties": {
@@ -2494,16 +2444,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -2796,16 +2758,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3008,16 +2982,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3127,16 +3113,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3192,6 +3190,127 @@
       ],
       "title": "TitleItem",
       "type": "object"
+    },
+    "TrackProvenance": {
+      "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.",
+      "properties": {
+        "kind": {
+          "const": "track",
+          "default": "track",
+          "description": "Identifies this type of provenance.",
+          "title": "Kind",
+          "type": "string"
+        },
+        "start_time": {
+          "description": "Start time offset of the track cue in seconds",
+          "examples": [
+            11.0,
+            6.5,
+            5370.0
+          ],
+          "title": "Start Time",
+          "type": "number"
+        },
+        "end_time": {
+          "description": "End time offset of the track cue in seconds",
+          "examples": [
+            12.0,
+            8.2,
+            5370.1
+          ],
+          "title": "End Time",
+          "type": "number"
+        },
+        "identifier": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "An identifier of the cue",
+          "examples": [
+            "test",
+            "123",
+            "b72d946"
+          ],
+          "title": "Identifier"
+        },
+        "voice": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The cue voice (speaker)",
+          "examples": [
+            "Mary",
+            "Fred",
+            "Name Surname"
+          ],
+          "title": "Voice"
+        },
+        "languages": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Languages of the cue in BCP 47 language tag format",
+          "examples": [
+            [
+              "en",
+              "en-GB"
+            ],
+            [
+              "fr-CA"
+            ]
+          ],
+          "title": "Languages"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "minItems": 1,
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Classes for describing the cue significance",
+          "examples": [
+            "b.first",
+            "v.loud",
+            "c.yellow"
+          ],
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "TrackProvenance",
+      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
index 5a7c9d29..78ce13b6 100644
--- a/test/data/doc/webvtt_example_01.json
+++ b/test/data/doc/webvtt_example_01.json
@@ -71,8 +71,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 11.0,
           "end_time": 13.0,
           "voice": "Roger Bingham"
@@ -89,8 +90,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 13.0,
           "end_time": 16.0,
           "voice": "Roger Bingham"
@@ -107,8 +109,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 16.0,
           "end_time": 18.0,
           "voice": "Roger Bingham"
@@ -125,8 +128,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 18.0,
           "end_time": 20.0,
           "voice": "Roger Bingham"
@@ -143,8 +147,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 20.0,
           "end_time": 22.0,
           "voice": "Roger Bingham"
@@ -161,8 +166,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 22.0,
           "end_time": 24.0,
           "voice": "Roger Bingham"
@@ -179,8 +185,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 24.0,
           "end_time": 26.0,
           "voice": "Roger Bingham"
@@ -197,8 +204,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 27.0,
           "end_time": 30.0,
           "voice": "Roger Bingham"
@@ -215,8 +223,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 30.0,
           "end_time": 31.5,
           "voice": "Roger Bingham"
@@ -233,8 +242,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 30.5,
           "end_time": 32.5,
           "voice": "Neil deGrasse Tyson"
@@ -251,8 +261,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 32.0,
           "end_time": 35.5,
           "voice": "Roger Bingham"
@@ -269,8 +280,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 32.5,
           "end_time": 33.5,
           "voice": "Neil deGrasse Tyson"
@@ -294,8 +306,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 35.5,
           "end_time": 38.0,
           "voice": "Roger Bingham"
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
index 2966a2e0..35c53692 100644
--- a/test/data/doc/webvtt_example_02.json
+++ b/test/data/doc/webvtt_example_02.json
@@ -88,8 +88,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 0.0,
           "end_time": 2.0,
           "voice": "Esme",
@@ -109,8 +110,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 2.0,
           "end_time": 4.0,
           "voice": "Mary"
@@ -127,8 +129,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0,
           "voice": "Esme"
@@ -145,8 +148,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0
         }
@@ -162,8 +166,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0
         }
@@ -186,8 +191,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 6.0,
           "end_time": 8.0,
           "voice": "Mary",
@@ -207,8 +213,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0
         }
@@ -224,8 +231,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0,
           "languages": [
@@ -254,8 +262,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0
         }
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
index dddce0f2..42d9e5b2 100644
--- a/test/data/doc/webvtt_example_03.json
+++ b/test/data/doc/webvtt_example_03.json
@@ -83,8 +83,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
@@ -102,8 +103,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
@@ -121,8 +123,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.571,
           "end_time": 9.403,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
@@ -140,8 +143,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 10.683,
           "end_time": 11.563,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
@@ -158,8 +162,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 13.363,
           "end_time": 13.803,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
@@ -177,8 +182,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 49.603,
           "end_time": 53.363,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
@@ -196,8 +202,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 54.963,
           "end_time": 62.072,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
@@ -215,8 +222,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
@@ -234,8 +242,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
@@ -253,8 +262,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
@@ -272,8 +282,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
@@ -291,8 +302,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.563,
           "end_time": 72.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
@@ -310,8 +322,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
@@ -329,8 +342,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
@@ -348,8 +362,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 110.222,
           "end_time": 111.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
@@ -367,8 +382,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 112.043,
           "end_time": 115.043,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
@@ -386,8 +402,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 114.603,
           "end_time": 115.283,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
index f96765fc..7e12385d 100644
--- a/test/data/doc/webvtt_example_04.json
+++ b/test/data/doc/webvtt_example_04.json
@@ -80,8 +80,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 1.0,
           "end_time": 4.0
         }
@@ -97,8 +98,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -114,8 +116,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -131,8 +134,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
           "classes": [
@@ -158,8 +162,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -175,8 +180,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
           "voice": "John"
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
index 616c94fc..9a53b3b0 100644
--- a/test/data/doc/webvtt_example_05.json
+++ b/test/data/doc/webvtt_example_05.json
@@ -94,8 +94,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14580.0,
           "end_time": 14760.0,
           "identifier": "agcvs-08234"
@@ -112,8 +113,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -130,8 +132,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -155,8 +158,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -183,8 +187,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -201,8 +206,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -219,8 +225,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -247,8 +254,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -272,8 +280,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -300,8 +309,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -325,8 +335,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py
index 58fb50db..28b41ad6 100644
--- a/test/test_deserializer_idoctags.py
+++ b/test/test_deserializer_idoctags.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from test.test_serialization_doctag import verify
 
 import pytest
 
@@ -21,7 +20,8 @@
     TableData,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from test.test_serialization_idoctag import add_texts_section, add_list_section
+from test.test_serialization_doctag import verify
+from test.test_serialization_idoctag import add_list_section, add_texts_section
 
 DO_PRINT: bool = False
 
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 2d1ce498..45a9445c 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc import ProvenanceTrack
+from docling_core.types.doc import TrackProvenance
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -41,9 +41,9 @@ def test_prov():
 
 
 def test_prov_track():
-    """Test the class ProvenanceTrack."""
+    """Test the class TrackProvenance."""
 
-    valid_track = ProvenanceTrack(
+    valid_track = TrackProvenance(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
@@ -61,17 +61,17 @@ def test_prov_track():
     assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
-        ProvenanceTrack(start_time=11.0)
+        TrackProvenance(start_time=11.0)
 
     with pytest.raises(ValidationError, match="should be a valid list"):
-        ProvenanceTrack(
+        TrackProvenance(
             start_time=11.0,
             end_time=12.0,
             languages="en",
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
-        ProvenanceTrack(
+        TrackProvenance(
             start_time=11.0,
             end_time=11.0,
         )
diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py
index 45d0c983..9b378b03 100644
--- a/test/test_serialization_doctag.py
+++ b/test/test_serialization_doctag.py
@@ -6,8 +6,7 @@
     DocTagsDocSerializer,
     DocTagsParams,
 )
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DoclingDocument, TableData
+from docling_core.types.doc import DoclingDocument, TableData
 from docling_core.types.doc.labels import DocItemLabel
 
 from .test_serialization import verify
diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py
index 43aaa79e..1c0f8479 100644
--- a/test/test_serialization_idoctag.py
+++ b/test/test_serialization_idoctag.py
@@ -2,37 +2,39 @@
 
 from pathlib import Path
 from typing import Optional
-from test.test_serialization import verify
 
 import pytest
 
 from docling_core.experimental.idoctags import (
     ContentType,
-    WrapMode,
     EscapeMode,
     IDocTagsDocSerializer,
     IDocTagsParams,
     IDocTagsSerializationMode,
     IDocTagsVocabulary,
+    WrapMode,
 )
 from docling_core.types.doc import (
+    BoundingBox,
+    CodeLanguageLabel,
+    CoordOrigin,
+    DescriptionMetaField,
     DocItemLabel,
     DoclingDocument,
     Formatting,
-    Script,
-    TableData,
-)
-from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.document import (
-    DescriptionMetaField,
+    PictureClassificationLabel,
     PictureClassificationMetaField,
     PictureClassificationPrediction,
     PictureMeta,
     ProvenanceItem,
+    Script,
+    Size,
     SummaryMetaField,
+    TableData,
     TabularChartMetaField,
 )
-from docling_core.types.doc.labels import CodeLanguageLabel, PictureClassificationLabel
+from test.test_serialization import verify
+
 
 def add_texts_section(doc: DoclingDocument):
     doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
@@ -427,7 +429,7 @@ def test_content_allow_all_types(sample_doc: DoclingDocument):
     serializer = IDocTagsDocSerializer(
         doc=doc,
         params=IDocTagsParams(
-            content_types={ct for ct in ContentType},
+            content_types=set(ContentType),
         ),
     )
     ser_txt = serializer.serialize().text
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 938da37c..5b1693e3 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -1,6 +1,5 @@
 """Test the data model for WebVTT files.
 
-Assisted by watsonx Code Assistant.
 Examples extracted from https://www.w3.org/TR/webvtt1/
 Copyright © 2019 World Wide Web Consortium.
 """

From 00a355d81f8b26f0ee9ea2cbce0831c72709f062 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 23 Jan 2026 16:28:56 +0100
Subject: [PATCH 18/20] tests(webvtt): fix test with STYLE and NOTE blocks

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 test/test_serialization_doctag.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py
index 9b378b03..86237a9a 100644
--- a/test/test_serialization_doctag.py
+++ b/test/test_serialization_doctag.py
@@ -6,8 +6,7 @@
     DocTagsDocSerializer,
     DocTagsParams,
 )
-from docling_core.types.doc import DoclingDocument, TableData
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData
 
 from .test_serialization import verify
 

From 818fc626cb5cab7a7ce6c2bd29125c472702a362 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 23 Jan 2026 19:04:04 +0100
Subject: [PATCH 19/20] style(webvtt): apply X | Y annotation instead of
 Optional, Union

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py | 26 +++++++++---------
 docling_core/types/doc/webvtt.py             | 28 +++++++++-----------
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index bfd1fd55..bbc6e344 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from typing import Any, Optional, get_args
+from typing import Any, get_args
 
 from pydantic import BaseModel
 from typing_extensions import override
@@ -132,7 +132,7 @@ def serialize(
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
         is_inline_scope: bool = False,
-        visited: Optional[set[str]] = None,
+        visited: set[str] | None = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
@@ -158,7 +158,7 @@ def serialize(
         )
         if is_inline_scope:
             # Iteratively remove unnecessary consecutive tag pairs until no more changes
-            prev_text: Optional[str] = None
+            prev_text: str | None = None
             while prev_text != text:
                 prev_text = text
                 text = _remove_consecutive_pairs(text)
@@ -275,7 +275,7 @@ def serialize(
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         list_level: int = 0,
-        visited: Optional[set[str]] = None,
+        visited: set[str] | None = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes an inline group to WebVTT format."""
@@ -343,7 +343,7 @@ class WebVTTDocSerializer(DocSerializer):
     fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer()
     list_serializer: BaseListSerializer = _WebVTTListSerializer()
     inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer()
-    meta_serializer: Optional[BaseMetaSerializer] = _WebVTTMetaSerializer()
+    meta_serializer: BaseMetaSerializer | None = _WebVTTMetaSerializer()
     annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer()
 
     params: CommonParams = CommonParams()
@@ -393,7 +393,7 @@ def serialize_cue_span(
         self,
         text: str,
         tag: START_TAG_NAMES,
-        anno: Optional[str] = None,
+        anno: str | None = None,
         css: list[str] = [],
     ) -> str:
         """Apply serialization to a WebVTT cue span."""
@@ -442,10 +442,10 @@ def serialize_doc(
         **kwargs: Any,
     ) -> SerializationResult:
         """Serialize a document out of its parts."""
-        title: Optional[str] = None
+        title: str | None = None
 
-        timings: Optional[WebVTTCueTimings] = None
-        id: Optional[str] = None
+        timings: WebVTTCueTimings | None = None
+        id: str | None = None
         text: str = ""
         cue_blocks: list[WebVTTCueBlock] = []
         for part in parts:
@@ -503,10 +503,10 @@ def serialize_doc(
     def post_process(
         self,
         text: str,
-        formatting: Optional[Formatting] = None,
-        voice: Optional[str] = None,
-        languages: Optional[list[str]] = None,
-        classes: Optional[list[str]] = None,
+        formatting: Formatting | None = None,
+        voice: str | None = None,
+        languages: list[str] | None = None,
+        classes: list[str] | None = None,
         **kwargs: Any,
     ) -> str:
         """Apply some text post-processing steps by adding formatting tags.
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 297e97fb..f7c4eea6 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -5,7 +5,7 @@
 from collections.abc import Iterator
 from enum import Enum
 from functools import total_ordering
-from typing import Annotated, ClassVar, Literal, Optional, Union
+from typing import Annotated, ClassVar, Literal
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
@@ -204,7 +204,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
     component: "WebVTTCueComponent"
-    terminator: Optional[WebVTTLineTerminator] = None
+    terminator: WebVTTLineTerminator | None = None
 
     @override
     def __str__(self) -> str:
@@ -215,7 +215,7 @@ def __str__(self) -> str:
 class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
-    terminator: Optional[WebVTTLineTerminator] = None
+    terminator: WebVTTLineTerminator | None = None
     components: Annotated[
         list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
@@ -380,15 +380,13 @@ class WebVTTCueLanguageSpan(WebVTTCueComponentBase):
 
 
 WebVTTCueComponent = Annotated[
-    Union[
-        WebVTTCueTextSpan,
-        WebVTTCueClassSpan,
-        WebVTTCueItalicSpan,
-        WebVTTCueBoldSpan,
-        WebVTTCueUnderlineSpan,
-        WebVTTCueVoiceSpan,
-        WebVTTCueLanguageSpan,
-    ],
+    WebVTTCueTextSpan
+    | WebVTTCueClassSpan
+    | WebVTTCueItalicSpan
+    | WebVTTCueBoldSpan
+    | WebVTTCueUnderlineSpan
+    | WebVTTCueVoiceSpan
+    | WebVTTCueLanguageSpan,
     Field(
         discriminator="kind",
         description="The type of WebVTT caption or subtitle cue component.",
@@ -406,7 +404,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None
+    identifier: Annotated[WebVTTCueIdentifier | None, Field(description="The WebVTT cue identifier")] = None
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
@@ -456,7 +454,7 @@ def parse(cls, raw: str) -> Self:
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
-        identifier: Optional[WebVTTCueIdentifier] = None
+        identifier: WebVTTCueIdentifier | None = None
         timing_line = lines[0]
         if "-->" not in timing_line and len(lines) > 1:
             identifier = timing_line
@@ -585,7 +583,7 @@ class WebVTTFile(BaseModel):
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
     cue_blocks: list[WebVTTCueBlock]
-    title: Optional[str] = None
+    title: str | None = None
 
     @staticmethod
     def verify_signature(content: str) -> bool:

From 55fb835f06d0e3578c7527c825e6adb57cbeb37c Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 26 Jan 2026 00:10:56 +0100
Subject: [PATCH 20/20] refactor(webvtt): simplify TrackProvenance model with
 tags

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py |  47 +++++---
 docling_core/types/doc/document.py           |  39 +++---
 docling_core/types/doc/webvtt.py             |  12 +-
 docs/DoclingDocument.json                    | 118 ++++++++++++++-----
 test/data/doc/webvtt_example_01.json         |  91 ++++++++++++--
 test/data/doc/webvtt_example_02.json         |  53 +++++++--
 test/data/doc/webvtt_example_03.json         | 112 +++++++++++++++---
 test/data/doc/webvtt_example_04.json         |  16 ++-
 test/data/doc/webvtt_example_05.json         |  23 +++-
 test/test_doc_base.py                        |  32 +++--
 10 files changed, 407 insertions(+), 136 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index bbc6e344..eba06b36 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -152,9 +152,7 @@ def serialize(
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
-            voice=prov.voice,
-            languages=prov.languages,
-            classes=prov.classes,
+            tags=prov.tags,
         )
         if is_inline_scope:
             # Iteratively remove unnecessary consecutive tag pairs until no more changes
@@ -394,7 +392,7 @@ def serialize_cue_span(
         text: str,
         tag: START_TAG_NAMES,
         anno: str | None = None,
-        css: list[str] = [],
+        css: list[str] | None = None,
     ) -> str:
         """Apply serialization to a WebVTT cue span."""
         start_tag: WebVTTCueSpanStartTag
@@ -504,9 +502,7 @@ def post_process(
         self,
         text: str,
         formatting: Formatting | None = None,
-        voice: str | None = None,
-        languages: list[str] | None = None,
-        classes: list[str] | None = None,
+        tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None,
         **kwargs: Any,
     ) -> str:
         """Apply some text post-processing steps by adding formatting tags.
@@ -521,25 +517,40 @@ def post_process(
             6. voice (<v>)
         """
         res: str = text
-        cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
-
-        for lang in languages or []:
-            res = self.serialize_cue_span(text=res, tag="lang", anno=lang, css=cls.get("lang", []))
-
-        res = super().post_process(text=res, formatting=formatting, classes=cls)
-
-        if "c" in cls:
+        # cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
+
+        languages: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang"
+        ]
+        for lang in languages:
+            res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes)
+
+        format_classes = {
+            item.name: item.classes
+            for item in tags or []
+            if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"}
+        }
+        res = super().post_process(text=res, formatting=formatting, classes=format_classes)
+
+        class_tag: list[WebVTTCueSpanStartTag] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c"
+        ]
+        if class_tag:
             res = self.serialize_cue_span(
                 text=res,
                 tag="c",
-                css=cls.get("c", []),
+                css=class_tag[0].classes,
             )
+
+        voice: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v"
+        ]
         if voice:
             res = self.serialize_cue_span(
                 text=res,
                 tag="v",
-                anno=voice,
-                css=cls.get("v", []),
+                anno=voice[0].annotation,
+                css=voice[0].classes,
             )
 
         return res
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index e864574f..a9dd4aa8 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -65,6 +65,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.webvtt import WebVTTCueIdentifier, WebVTTCueSpanStartTag, WebVTTCueSpanStartTagAnnotated
 
 _logger = logging.getLogger(__name__)
 
@@ -1187,8 +1188,8 @@ class TrackProvenance(BaseProvenance):
     block, an audio clip, or a timed marker in a screen-recording.
     """
 
+    model_config = ConfigDict(regex_engine="python-re")
     kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track"
-
     start_time: Annotated[
         float,
         Field(
@@ -1203,27 +1204,19 @@ class TrackProvenance(BaseProvenance):
             description="End time offset of the track cue in seconds",
         ),
     ]
-    identifier: Optional[str] = Field(
-        None,
-        examples=["test", "123", "b72d946"],
-        description="An identifier of the cue",
-    )
-    voice: Optional[str] = Field(
-        None,
-        examples=["Mary", "Fred", "Name Surname"],
-        description="The cue voice (speaker)",
-    )
-    languages: Optional[list[str]] = Field(
-        None,
-        examples=[["en", "en-GB"], ["fr-CA"]],
-        description="Languages of the cue in BCP 47 language tag format",
-    )
-    classes: Optional[list[str]] = Field(
-        None,
-        min_length=1,
-        examples=["b.first", "v.loud", "c.yellow"],
-        description="Classes for describing the cue significance",
-    )
+    identifier: Annotated[
+        WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"])
+    ] = None
+    tags: Annotated[
+        list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None,
+        Field(
+            description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
+            examples=[
+                [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")],
+                [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])],
+            ],
+        ),
+    ] = None
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
@@ -1406,7 +1399,7 @@ class PictureMeta(FloatingMeta):
     tabular_chart: Optional[TabularChartMetaField] = None
 
 
-class NodeItem(BaseModel):
+class NodeItem(BaseModel, validate_assignment=True):
     """NodeItem."""
 
     self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index f7c4eea6..32bfc12d 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -233,19 +233,19 @@ class WebVTTCueSpanStartTag(BaseModel):
 
     name: Annotated[START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
-        list[str],
+        list[str] | None,
         Field(description="List of classes representing the cue span's significance"),
-    ] = []
+    ] = None
 
     @field_validator("classes", mode="after")
     @classmethod
-    def validate_classes(cls, value: list[str]) -> list[str]:
+    def validate_classes(cls, value: list[str] | None) -> list[str] | None:
         """Validate cue span start tag classes."""
-        for item in value:
+        for item in value or []:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                 raise ValueError("A cue span start tag class contains invalid characters")
             if not item:
-                raise ValueError("Cue span start tag classes cannot be empty")
+                raise ValueError("A cue span start tag class cannot be empty")
         return value
 
     def _get_name_with_classes(self) -> str:
@@ -501,7 +501,7 @@ def parse(cls, raw: str) -> Self:
                             raise ValueError(f"Incorrect end tag: {ct}")
                         class_string = closed["class"]
                         annotation = closed["annotation"]
-                        classes: list[str] = []
+                        classes: list[str] | None = None
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
                         st: WebVTTCueSpanStartTag
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index b37260eb..6b617f28 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -3224,6 +3224,7 @@
         "identifier": {
           "anyOf": [
             {
+              "pattern": "^(?!.*-->)[^\\n\\r]+$",
               "type": "string"
             },
             {
@@ -3239,25 +3240,73 @@
           ],
           "title": "Identifier"
         },
-        "voice": {
+        "tags": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "anyOf": [
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTag"
+                  },
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated"
+                  }
+                ]
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "The cue voice (speaker)",
+          "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
           "examples": [
-            "Mary",
-            "Fred",
-            "Name Surname"
+            [
+              {
+                "annotation": "John",
+                "classes": [
+                  "loud"
+                ],
+                "name": "v"
+              }
+            ],
+            [
+              {
+                "classes": [
+                  "foreignphrase"
+                ],
+                "name": "i"
+              }
+            ]
+          ],
+          "title": "Tags"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "TrackProvenance",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTag": {
+      "description": "WebVTT cue span start tag.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
           ],
-          "title": "Voice"
+          "title": "Name",
+          "type": "string"
         },
-        "languages": {
+        "classes": {
           "anyOf": [
             {
               "items": {
@@ -3270,17 +3319,31 @@
             }
           ],
           "default": null,
-          "description": "Languages of the cue in BCP 47 language tag format",
-          "examples": [
-            [
-              "en",
-              "en-GB"
-            ],
-            [
-              "fr-CA"
-            ]
+          "description": "List of classes representing the cue span's significance",
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "name"
+      ],
+      "title": "WebVTTCueSpanStartTag",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTagAnnotated": {
+      "description": "WebVTT cue span start tag requiring an annotation.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
           ],
-          "title": "Languages"
+          "title": "Name",
+          "type": "string"
         },
         "classes": {
           "anyOf": [
@@ -3288,7 +3351,6 @@
               "items": {
                 "type": "string"
               },
-              "minItems": 1,
               "type": "array"
             },
             {
@@ -3296,20 +3358,20 @@
             }
           ],
           "default": null,
-          "description": "Classes for describing the cue significance",
-          "examples": [
-            "b.first",
-            "v.loud",
-            "c.yellow"
-          ],
+          "description": "List of classes representing the cue span's significance",
           "title": "Classes"
+        },
+        "annotation": {
+          "description": "Cue span start tag annotation",
+          "title": "Annotation",
+          "type": "string"
         }
       },
       "required": [
-        "start_time",
-        "end_time"
+        "name",
+        "annotation"
       ],
-      "title": "TrackProvenance",
+      "title": "WebVTTCueSpanStartTagAnnotated",
       "type": "object"
     }
   },
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
index 78ce13b6..85d119be 100644
--- a/test/data/doc/webvtt_example_01.json
+++ b/test/data/doc/webvtt_example_01.json
@@ -76,7 +76,12 @@
           "kind": "track",
           "start_time": 11.0,
           "end_time": 13.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "We are in New York City",
@@ -95,7 +100,12 @@
           "kind": "track",
           "start_time": 13.0,
           "end_time": 16.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "We’re actually at the Lucern Hotel, just down the street",
@@ -114,7 +124,12 @@
           "kind": "track",
           "start_time": 16.0,
           "end_time": 18.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "from the American Museum of Natural History",
@@ -133,7 +148,12 @@
           "kind": "track",
           "start_time": 18.0,
           "end_time": 20.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "And with me is Neil deGrasse Tyson",
@@ -152,7 +172,12 @@
           "kind": "track",
           "start_time": 20.0,
           "end_time": 22.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "Astrophysicist, Director of the Hayden Planetarium",
@@ -171,7 +196,12 @@
           "kind": "track",
           "start_time": 22.0,
           "end_time": 24.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "at the AMNH.",
@@ -190,7 +220,12 @@
           "kind": "track",
           "start_time": 24.0,
           "end_time": 26.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "Thank you for walking down here.",
@@ -209,7 +244,12 @@
           "kind": "track",
           "start_time": 27.0,
           "end_time": 30.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "And I want to do a follow-up on the last conversation we did.",
@@ -228,7 +268,12 @@
           "kind": "track",
           "start_time": 30.0,
           "end_time": 31.5,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "When we e-mailed—",
@@ -247,7 +292,12 @@
           "kind": "track",
           "start_time": 30.5,
           "end_time": 32.5,
-          "voice": "Neil deGrasse Tyson"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
         }
       ],
       "orig": "Didn’t we talk about enough in that conversation?",
@@ -266,7 +316,12 @@
           "kind": "track",
           "start_time": 32.0,
           "end_time": 35.5,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "No! No no no no; 'cos 'cos obviously 'cos",
@@ -285,7 +340,12 @@
           "kind": "track",
           "start_time": 32.5,
           "end_time": 33.5,
-          "voice": "Neil deGrasse Tyson"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
         }
       ],
       "orig": "Laughs",
@@ -311,7 +371,12 @@
           "kind": "track",
           "start_time": 35.5,
           "end_time": 38.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "You know I’m so excited my glasses are falling off here.",
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
index 35c53692..55fd15ea 100644
--- a/test/data/doc/webvtt_example_02.json
+++ b/test/data/doc/webvtt_example_02.json
@@ -93,9 +93,15 @@
           "kind": "track",
           "start_time": 0.0,
           "end_time": 2.0,
-          "voice": "Esme",
-          "classes": [
-            "v.first.loud"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme",
+              "classes": [
+                "first",
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -115,7 +121,12 @@
           "kind": "track",
           "start_time": 2.0,
           "end_time": 4.0,
-          "voice": "Mary"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary"
+            }
+          ]
         }
       ],
       "orig": "No way!",
@@ -134,7 +145,12 @@
           "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0,
-          "voice": "Esme"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme"
+            }
+          ]
         }
       ],
       "orig": "Hee!",
@@ -196,9 +212,14 @@
           "kind": "track",
           "start_time": 6.0,
           "end_time": 8.0,
-          "voice": "Mary",
-          "classes": [
-            "v.loud"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -236,11 +257,17 @@
           "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0,
-          "languages": [
-            "en"
-          ],
-          "classes": [
-            "i.foreignphrase"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "en"
+            },
+            {
+              "name": "i",
+              "classes": [
+                "foreignphrase"
+              ]
+            }
           ]
         }
       ],
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
index 42d9e5b2..7b6faa6c 100644
--- a/test/data/doc/webvtt_example_03.json
+++ b/test/data/doc/webvtt_example_03.json
@@ -89,7 +89,12 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "OK,",
@@ -109,7 +114,12 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I think now we should be recording",
@@ -129,7 +139,12 @@
           "start_time": 8.571,
           "end_time": 9.403,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "properly.",
@@ -168,7 +183,12 @@
           "start_time": 13.363,
           "end_time": 13.803,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "Yeah.",
@@ -188,7 +208,12 @@
           "start_time": 49.603,
           "end_time": 53.363,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "I was also thinking.",
@@ -208,7 +233,12 @@
           "start_time": 54.963,
           "end_time": 62.072,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "Would be maybe good to create items,",
@@ -228,7 +258,12 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "some metadata,",
@@ -248,7 +283,12 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "some options that can be specific.",
@@ -268,7 +308,12 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "Yeah,",
@@ -288,7 +333,12 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I mean I think you went even more than",
@@ -308,7 +358,12 @@
           "start_time": 70.563,
           "end_time": 72.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "But we preserved the atoms.",
@@ -328,7 +383,12 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "than me.",
@@ -348,7 +408,12 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I just opened the format.",
@@ -368,7 +433,12 @@
           "start_time": 110.222,
           "end_time": 111.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "give it a try, yeah.",
@@ -388,7 +458,12 @@
           "start_time": 112.043,
           "end_time": 115.043,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "Okay, talk to you later.",
@@ -408,7 +483,12 @@
           "start_time": 114.603,
           "end_time": 115.283,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "See you.",
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
index 7e12385d..98e7da21 100644
--- a/test/data/doc/webvtt_example_04.json
+++ b/test/data/doc/webvtt_example_04.json
@@ -139,8 +139,13 @@
           "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
-          "classes": [
-            "b.loud"
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -185,7 +190,12 @@
           "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
-          "voice": "John"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "John"
+            }
+          ]
         }
       ],
       "orig": "This is true.",
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
index 9a53b3b0..4af18174 100644
--- a/test/data/doc/webvtt_example_05.json
+++ b/test/data/doc/webvtt_example_05.json
@@ -164,8 +164,11 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "languages": [
-            "es-ES"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "es-ES"
+            }
           ]
         }
       ],
@@ -231,8 +234,13 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "classes": [
-            "b.loud"
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -286,8 +294,11 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "languages": [
-            "it"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "it"
+            }
           ]
         }
       ],
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 45a9445c..5d569716 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc import TrackProvenance
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -40,34 +40,40 @@ def test_prov():
         Prov(**prov)
 
 
-def test_prov_track():
+def test_track_provenance():
     """Test the class TrackProvenance."""
 
     valid_track = TrackProvenance(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
-        voice="Mary",
-        languages=["en", "en-GB"],
-        classes=["v.first.loud", "i.foreignphrase"],
+        tags = [
+            {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]},
+            {"name": "lang", "annotation": "en"},
+            {"name": "lang", "annotation": "en-GB"},
+            {"name": "i", "classes": ["foreignphrase"]},
+        ]
     )
 
     assert valid_track
     assert valid_track.start_time == 11.0
     assert valid_track.end_time == 12.0
     assert valid_track.identifier == "test"
-    assert valid_track.voice == "Mary"
-    assert valid_track.languages == ["en", "en-GB"]
-    assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
+    assert valid_track.tags
+    assert valid_track.tags[0].annotation == "Mary"
+    assert valid_track.tags[0].classes == ["first", "loud"]
+    assert valid_track.tags[1].annotation == "en"
+    assert valid_track.tags[2].annotation == "en-GB"
+    assert valid_track.tags[3].classes == ["foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
         TrackProvenance(start_time=11.0)
 
-    with pytest.raises(ValidationError, match="should be a valid list"):
+    with pytest.raises(ValidationError, match="should be a valid dictionary"):
         TrackProvenance(
             start_time=11.0,
             end_time=12.0,
-            languages="en",
+            tags=["en"],
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
@@ -75,3 +81,9 @@ def test_prov_track():
             start_time=11.0,
             end_time=11.0,
         )
+
+    doc = DoclingDocument(name="Unknown")
+    item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT)
+    item.source = [valid_track]
+    with pytest.raises(ValidationError, match="should be a valid list"):
+        item.source = "Invalid source"