diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
index 385aca6a..1addf996 100644
--- a/docling_core/transforms/serializer/azure.py
+++ b/docling_core/transforms/serializer/azure.py
@@ -44,9 +44,10 @@
     DocSerializer,
     create_ser_result,
 )
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import (
+from docling_core.types.doc import (
+    CoordOrigin,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FormItem,
     InlineGroup,
@@ -59,7 +60,6 @@
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
 
 
 def _bbox_to_polygon_coords(
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 3a8ad71c..43bfd54b 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -35,11 +35,11 @@
     SerializationResult,
     Span,
 )
-from docling_core.types.doc.document import (
-    DOCUMENT_TOKENS_EXPORT_LABELS,
+from docling_core.types.doc import (
     ContentLayer,
     DescriptionAnnotation,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FloatingItem,
     Formatting,
@@ -57,7 +57,7 @@
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS
 
 _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
 _DEFAULT_LAYERS = set(ContentLayer)
@@ -317,7 +317,7 @@ def serialize_doc(
         parts: list[SerializationResult],
         **kwargs: Any,
     ) -> SerializationResult:
-        """Serialize a document out of its pages."""
+        """Serialize a document out of its parts."""
         ...
 
     def _serialize_body(self, **kwargs) -> SerializationResult:
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index e5672638..dc8c520f 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -26,11 +26,13 @@
     _should_use_legacy_annotations,
     create_ser_result,
 )
-from docling_core.types.doc.base import BoundingBox
 from docling_core.types.doc.document import (
+    BoundingBox,
     CodeItem,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
+    DocumentToken,
     FloatingItem,
     FormItem,
     GroupItem,
@@ -40,6 +42,7 @@
     ListItem,
     NodeItem,
     PictureClassificationData,
+    PictureClassificationLabel,
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
@@ -47,10 +50,9 @@
     SectionHeaderItem,
     TableData,
     TableItem,
+    TableToken,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
-from docling_core.types.doc.tokens import DocumentToken, TableToken
 
 
 def _wrap(text: str, wrap_tag: str) -> str:
diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
new file mode 100644
index 00000000..eba06b36
--- /dev/null
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -0,0 +1,556 @@
+"""Define classes for WebVTT serialization."""
+
+import logging
+import re
+from typing import Any, get_args
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BaseMetaSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.transforms.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    create_ser_result,
+)
+from docling_core.types.doc.document import (
+    ContentLayer,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    Formatting,
+    FormItem,
+    InlineGroup,
+    KeyValueItem,
+    ListGroup,
+    NodeItem,
+    PictureItem,
+    TableItem,
+    TextItem,
+    TitleItem,
+    TrackProvenance,
+)
+from docling_core.types.doc.webvtt import (
+    START_TAG_NAMES,
+    WebVTTCueBlock,
+    WebVTTCueSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTimings,
+    WebVTTFile,
+    WebVTTLineTerminator,
+    WebVTTTimestamp,
+)
+
+_logger = logging.getLogger(__name__)
+
+
+def _remove_consecutive_pairs(text: str) -> str:
+    """Run one pass that merges a closed cue span with an immediately re-opened one.
+
+    Only the tag names b, c, i, u, v and lang are considered. Two cases are handled:
+    1. <tag>content</tag>whitespace<tag>: merged only if classes and annotation match.
+    2. </tag><othertag><tag>: both tags dropped; classes/annotation are NOT compared.
+    Matches are replaced in a single non-overlapping pass, so callers loop until stable.
+
+    Args:
+        text: Input string
+
+    Returns:
+        String with one pass of consecutive pairs removed
+    """
+    # Pattern 1: <tag>content</tag>, optional whitespace, then the same tag opened again
+    pattern1 = re.compile(
+        r"<([bciuv]|lang)((?:\.\w+)*)(?:\s+([^>]+))?>"  # Opening tag: capture tag, classes, annotation
+        r"((?:(?!</\1>).)*?)"  # Content without the closing tag; "." stops at newlines (no re.DOTALL)
+        r"</\1>"  # Closing tag
+        r"(\s*)"  # Capture whitespace between tags (including newlines)
+        r"<\1((?:\.\w+)*)(?:\s+([^>]+))?>"  # Next opening tag: capture classes and annotation
+    )
+
+    def replacer1(match: re.Match[str]) -> str:
+        tag = match.group(1)
+        classes1 = match.group(2) or ""
+        anno1 = match.group(3) or ""
+        content = match.group(4)
+        whitespace = match.group(5)  # Whitespace between tags
+        classes2 = match.group(6) or ""
+        anno2 = match.group(7) or ""
+
+        # Only merge if classes and annotations match
+        if classes1 == classes2 and anno1 == anno2:
+            # Merge: remove the closing and opening tags, but keep the whitespace
+            return f"<{tag}{classes1}{' ' + anno1 if anno1 else ''}>{content}{whitespace}"
+        else:
+            # Don't merge - return original
+            return match.group(0)
+
+    # Pattern 2: Tags with other tags in between </tag><othertag><tag>
+    # The re-opened tag's classes/annotation are matched but discarded, not compared
+    pattern2 = re.compile(
+        r"</([bciuv]|lang)>"  # Closing tag
+        r"(<[^>]+>)"  # Any other tag in between
+        r"<\1(?:\.\w+)*(?:\s+[^>]+)?>"  # Same opening tag (with any classes/annotations)
+    )
+
+    def replacer2(match: re.Match[str]) -> str:
+        # Just keep the middle tag, remove the closing and opening of the same type
+        return match.group(2)
+
+    result = pattern1.sub(replacer1, text)
+    result = pattern2.sub(replacer2, result)
+
+    return result
+
+
+class WebVTTParams(CommonParams):
+    """Serialization parameters for the Web Video Text Tracks (WebVTT) format."""
+
+    layers: set[ContentLayer] = {ContentLayer.BODY}  # default: only the BODY content layer
+
+
+class WebVTTTextSerializer(BaseModel, BaseTextSerializer):
+    """Serializer turning text items into WebVTT cue payload text."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        is_inline_scope: bool = False,
+        visited: set[str] | None = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize a title or a track-sourced text item; anything else yields an empty result."""
+        if isinstance(item, TitleItem):
+            # Titles carry no track provenance, but their text is still needed downstream.
+            return create_ser_result(text=item.text, span_source=item)
+
+        # Everything else must be a non-empty cue, i.e. its first source entry is a track.
+        is_cue = bool(item.text) and bool(item.source) and item.source[0].kind == "track"
+        if not is_cue:
+            return create_ser_result()
+
+        # Wrap the raw text in its formatting, class, language and voice tags. When the
+        # item belongs to an InlineGroup, the group serializer post-processes it further.
+        track: TrackProvenance = item.source[0]
+        cue_text: str = doc_serializer.post_process(
+            text=item.text,
+            formatting=item.formatting,
+            tags=track.tags,
+        )
+
+        if is_inline_scope:
+            # Collapse redundant close/open tag pairs until a pass changes nothing.
+            merged = _remove_consecutive_pairs(cue_text)
+            while merged != cue_text:
+                cue_text = merged
+                merged = _remove_consecutive_pairs(cue_text)
+
+        return create_ser_result(text=cue_text, span_source=item)
+
+
+class _WebVTTTableSerializer(BaseTableSerializer):
+    """Table serializer stub: tables are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTPictureSerializer(BasePictureSerializer):
+    """Picture serializer stub: pictures are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTKeyValueSerializer(BaseKeyValueSerializer):
+    """Key-value serializer stub: key-value items are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTFormSerializer(BaseFormSerializer):
+    """Form serializer stub: form items are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTFallbackSerializer(BaseFallbackSerializer):
+    """Fallback serializer stub: remaining node types are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTListSerializer(BaseModel, BaseListSerializer):
+    """List serializer stub: list groups are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: ListGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        list_level: int = 0,
+        is_inline_scope: bool = False,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, list_level, is_inline_scope, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class WebVTTInlineSerializer(BaseInlineSerializer):
+    """Serializer rendering an inline group as a single WebVTT text run."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        list_level: int = 0,
+        visited: set[str] | None = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Concatenate the group's serialized children and merge redundant tag pairs."""
+        del doc  # intentionally unused
+        seen: set[str] = set() if visited is None else visited
+        children = doc_serializer.get_parts(
+            item=item,
+            list_level=list_level,
+            is_inline_scope=True,
+            visited=seen,
+            **kwargs,
+        )
+
+        # Keep empty and whitespace-only fragments: filter on None rather than on truthiness.
+        joined = "".join(child.text for child in children if child.text is not None)
+
+        # Neighbouring children may close and re-open the same tag; collapse those
+        # pairs repeatedly until a pass leaves the text untouched.
+        merged = _remove_consecutive_pairs(joined)
+        while merged != joined:
+            joined = merged
+            merged = _remove_consecutive_pairs(joined)
+
+        return create_ser_result(text=joined, span_source=children)
+
+
+class _WebVTTMetaSerializer(BaseModel, BaseMetaSerializer):
+    """Meta serializer stub: node metadata is not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _WebVTTAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
+    """Annotation serializer stub: annotations are not represented in WebVTT output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class WebVTTDocSerializer(DocSerializer):
+    """Document serializer to Web Video Text Tracks (WebVTT) format."""
+
+    text_serializer: BaseTextSerializer = WebVTTTextSerializer()
+    table_serializer: BaseTableSerializer = _WebVTTTableSerializer()
+    picture_serializer: BasePictureSerializer = _WebVTTPictureSerializer()
+    key_value_serializer: BaseKeyValueSerializer = _WebVTTKeyValueSerializer()
+    form_serializer: BaseFormSerializer = _WebVTTFormSerializer()
+    fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer()
+    list_serializer: BaseListSerializer = _WebVTTListSerializer()
+    inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer()
+    meta_serializer: BaseMetaSerializer | None = _WebVTTMetaSerializer()
+    annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer()
+
+    params: CommonParams = CommonParams()  # NOTE(review): WebVTTParams (BODY only) is defined but not used here
+
+    @override
+    def requires_page_break(self) -> bool:
+        """Whether to add page breaks.
+
+        WebVTT format does not support page breaks.
+        """
+        return False
+
+    @override
+    def serialize_bold(self, text: str, **kwargs: Any) -> str:
+        """Wrap text in a <b> cue span, using the classes found under kwargs["classes"]["b"]."""
+        classes: list[str] = kwargs.get("classes", {}).get("b", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="b",
+            css=classes,
+        )
+
+    @override
+    def serialize_italic(self, text: str, **kwargs: Any) -> str:
+        """Wrap text in an <i> cue span, using the classes found under kwargs["classes"]["i"]."""
+        classes: list[str] = kwargs.get("classes", {}).get("i", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="i",
+            css=classes,
+        )
+
+    @override
+    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+        """Wrap text in a <u> cue span, using the classes found under kwargs["classes"]["u"]."""
+        classes: list[str] = kwargs.get("classes", {}).get("u", [])
+
+        return self.serialize_cue_span(
+            text,
+            tag="u",
+            css=classes,
+        )
+
+    def serialize_cue_span(
+        self,
+        text: str,
+        tag: START_TAG_NAMES,
+        anno: str | None = None,
+        css: list[str] | None = None,
+    ) -> str:
+        """Wrap text in a cue span tag; v/lang without annotation and unknown tags return text as is."""
+        start_tag: WebVTTCueSpanStartTag
+        if tag in {"b", "i", "u", "c"}:
+            start_tag = WebVTTCueSpanStartTag(name=tag, classes=css)
+        elif tag in {"v", "lang"}:
+            if not anno:
+                # An annotated tag without its annotation is skipped: log and emit the bare text.
+                _logger.warning("Invalid %s cue span without annotation: %s", tag, text)
+                return text
+            start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=css, annotation=anno)
+        else:
+            return text
+
+        # f-string formatting renders the opening tag through the tag model's string conversion.
+        return f"{start_tag}{text}</{tag}>"
+
+    @staticmethod
+    def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
+        """Extract tag and values from provenance classes.
+
+        Args:
+            classes: The classes from a TrackProvenance object.
+
+        Returns:
+            Map of tag to class values.
+        """
+        res: dict[str, list[str]] = {}
+        for item in classes or []:
+            for prefix in get_args(START_TAG_NAMES):
+                if item == prefix:
+                    res[prefix] = []
+                    break
+                elif item.startswith(prefix + "."):
+                    cls_str: str = item[len(prefix) + 1 :]
+                    res[prefix] = cls_str.split(".")
+                    break
+        return res
+
+    @override
+    def serialize_doc(
+        self,
+        *,
+        parts: list[SerializationResult],
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Build the WebVTT file text: title from title parts, cue blocks from track-sourced parts."""
+        title: str | None = None
+        # State of the cue block currently being accumulated (flushed when timing/identifier changes).
+        timings: WebVTTCueTimings | None = None
+        cue_id: str | None = None
+        text: str = ""
+        cue_blocks: list[WebVTTCueBlock] = []
+        for part in parts:
+            if not part.text or not part.spans:
+                continue
+
+            # Get the doc item from the first span
+            doc_item: DocItem = part.spans[0].item
+
+            # Handle title items (check both TitleItem type and label)
+            if isinstance(doc_item, TitleItem) or (
+                isinstance(doc_item, TextItem) and doc_item.label == DocItemLabel.TITLE
+            ):
+                title = part.text
+                continue
+            if isinstance(doc_item, InlineGroup) and doc_item.children:
+                doc_item = doc_item.children[0].resolve(doc=self.doc)
+            if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track":
+                prov: TrackProvenance = doc_item.source[0]
+                if (
+                    prov.identifier == cue_id
+                    and timings
+                    and timings.start.seconds == prov.start_time
+                    and timings.end.seconds == prov.end_time
+                ):
+                    # When combining items with same timing, add newline and merge consecutive tags
+                    combined = text.rstrip() + WebVTTLineTerminator.LF.value + part.text
+                    # Use _remove_consecutive_pairs to merge tags like </v>\n<v Speaker A>
+                    # Iteratively remove consecutive pairs until no more changes
+                    merged = _remove_consecutive_pairs(combined)
+                    while merged != combined:
+                        combined = merged
+                        merged = _remove_consecutive_pairs(combined)
+                    text = combined + WebVTTLineTerminator.LF.value
+                else:
+                    if text:
+                        cue_blocks.append(WebVTTCueBlock.parse(text))
+                    timings = WebVTTCueTimings(
+                        start=WebVTTTimestamp.from_seconds(prov.start_time),
+                        end=WebVTTTimestamp.from_seconds(prov.end_time),
+                    )
+                    cue_id = prov.identifier
+                    text = (
+                        f"{cue_id + WebVTTLineTerminator.LF.value if cue_id else ''}{timings}"
+                        f"{WebVTTLineTerminator.LF.value}{part.text}"
+                        f"{WebVTTLineTerminator.LF.value}"
+                    )
+        if text:
+            cue_blocks.append(WebVTTCueBlock.parse(text))
+
+        webvtt_file = WebVTTFile(title=title, cue_blocks=cue_blocks)
+        content = str(webvtt_file)
+        return create_ser_result(text=content, span_source=parts)
+
+    def post_process(
+        self,
+        text: str,
+        formatting: Formatting | None = None,
+        tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """Apply some text post-processing steps by adding formatting tags.
+
+        The order of the formatting tags is determined by this function and `DocSerializer.post_process`,
+        from the innermost to the outermost:
+            1. language (<lang>)
+            2. underline (<u>)
+            3. italic (<i>)
+            4. bold (<b>)
+            5. class (<c>)
+            6. voice (<v>)
+        """
+        res: str = text
+        # Each step below wraps the result of the previous one, so earlier steps end up innermost.
+
+        languages: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang"
+        ]
+        for lang in languages:
+            res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes)
+
+        format_classes = {
+            item.name: item.classes
+            for item in tags or []
+            if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"}
+        }
+        res = super().post_process(text=res, formatting=formatting, classes=format_classes)
+
+        class_tag: list[WebVTTCueSpanStartTag] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c"
+        ]
+        if class_tag:
+            res = self.serialize_cue_span(
+                text=res,
+                tag="c",
+                css=class_tag[0].classes,
+            )
+
+        voice: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v"
+        ]
+        if voice:
+            res = self.serialize_cue_span(
+                text=res,
+                tag="v",
+                anno=voice[0].annotation,
+                css=voice[0].classes,
+            )
+
+        return res
diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py
index 5ed7b843..89b07f77 100644
--- a/docling_core/transforms/visualizer/key_value_visualizer.py
+++ b/docling_core/transforms/visualizer/key_value_visualizer.py
@@ -16,8 +16,12 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument
-from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
+from docling_core.types.doc import (
+    ContentLayer,
+    DoclingDocument,
+    GraphCellLabel,
+    GraphLinkLabel,
+)
 
 # ---------------------------------------------------------------------------
 # Helper functions / constants
diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py
index 369a7b38..043fedac 100644
--- a/docling_core/transforms/visualizer/layout_visualizer.py
+++ b/docling_core/transforms/visualizer/layout_visualizer.py
@@ -10,10 +10,15 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc import DocItemLabel
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc import (
+    BoundingRectangle,
+    ContentLayer,
+    CoordOrigin,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    TextCell,
+)
 
 
 class _TLBoundingRectangle(BoundingRectangle):
diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py
index 489a6d9a..5f601f9a 100644
--- a/docling_core/transforms/visualizer/table_visualizer.py
+++ b/docling_core/transforms/visualizer/table_visualizer.py
@@ -10,7 +10,7 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
+from docling_core.types.doc import ContentLayer, DoclingDocument, TableItem
 
 _log = logging.getLogger(__name__)
 
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index 3c699f89..c3a2b237 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -46,6 +46,7 @@
     PictureClassificationClass,
     PictureClassificationData,
     PictureClassificationMetaField,
+    PictureClassificationPrediction,
     PictureDataType,
     PictureItem,
     PictureLineChartData,
@@ -56,17 +57,20 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
+    ProvenanceType,
     RefItem,
     RichTableCell,
     Script,
     SectionHeaderItem,
     SummaryMetaField,
+    TableAnnotationType,
     TableCell,
     TableData,
     TableItem,
     TabularChartMetaField,
     TextItem,
     TitleItem,
+    TrackProvenance,
     UnorderedList,
 )
 from .labels import (
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 0ecc3e51..a9dd4aa8 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -65,6 +65,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.webvtt import WebVTTCueIdentifier, WebVTTCueSpanStartTag, WebVTTCueSpanStartTagAnnotated
 
 _logger = logging.getLogger(__name__)
 
@@ -958,6 +959,7 @@ class DocumentOrigin(BaseModel):
         "text/asciidoc",
         "text/markdown",
         "text/csv",
+        "text/vtt",
         "audio/x-wav",
         "audio/wav",
         "audio/mp3",
@@ -1155,11 +1157,91 @@ def from_multipage_doctags_and_images(
 
 
 class ProvenanceItem(BaseModel):
-    """ProvenanceItem."""
+    """Provenance information for elements extracted from a textual document.
 
-    page_no: int
-    bbox: BoundingBox
-    charspan: tuple[int, int]
+    A `ProvenanceItem` object acts as a lightweight pointer back into the original
+    document for an extracted element. It applies to documents with an explicit
+    or implicit layout, such as PDF, HTML, docx, or pptx.
+    """
+
+    page_no: Annotated[int, Field(description="Page number")]
+    bbox: Annotated[BoundingBox, Field(description="Bounding box")]
+    charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
+
+
+class BaseProvenance(BaseModel):
+    """Base class for provenance information.
+
+    Represents the provenance of an extracted component within a digital asset.
+    """
+    # TrackProvenance narrows `kind` to Literal["track"]; ProvenanceType discriminates on this field.
+    kind: Annotated[
+        str, Field(description="Kind of provenance. It is used as a discriminator for the provenance type.")
+    ]
+
+
+class TrackProvenance(BaseProvenance):
+    """Provenance metadata for a cue extracted from a media track.
+
+    A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,
+    etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle
+    block, an audio clip, or a timed marker in a screen-recording.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")  # WebVTTCueIdentifier's pattern uses a lookahead
+    kind: Annotated[Literal["track"], Field(description="Identifies this type of provenance.")] = "track"
+    start_time: Annotated[
+        float,
+        Field(
+            examples=[11.0, 6.5, 5370.0],
+            description="Start time offset of the track cue in seconds",
+        ),
+    ]
+    end_time: Annotated[
+        float,
+        Field(
+            examples=[12.0, 8.2, 5370.1],
+            description="End time offset of the track cue in seconds",
+        ),
+    ]
+    identifier: Annotated[
+        WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"])
+    ] = None
+    tags: Annotated[
+        list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None,
+        Field(
+            description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
+            examples=[
+                [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")],
+                [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])],
+            ],
+        ),
+    ] = None
+
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        """Reject cues whose end time is not strictly greater than the start time."""
+        if self.end_time <= self.start_time:
+            raise ValueError("End time must be greater than start time")
+        return self
+
+
+ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")]
+"""Union type for all provenance types.
+
+This type alias represents a discriminated union of all available provenance types that can be associated with
+extracted elements in a document. The `kind` field is used as a discriminator to determine the specific
+provenance type at runtime.
+
+Currently supported provenance types:
+    - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles)
+
+Notes:
+    - Additional provenance types may be added to this union in the future to support
+        other content sources.
+    - For documents with an implicit or explicit layout, such as PDF, HTML, docx, pptx, or markdown files, the
+        `ProvenanceItem` should still be used.
+"""
 
 
 class ContentLayer(str, Enum):
@@ -1317,7 +1399,7 @@ class PictureMeta(FloatingMeta):
     tabular_chart: Optional[TabularChartMetaField] = None
 
 
-class NodeItem(BaseModel):
+class NodeItem(BaseModel, validate_assignment=True):
     """NodeItem."""
 
     self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
@@ -1464,20 +1546,28 @@ class FineRef(RefItem):
     range: Optional[tuple[int, int]] = None  # start_inclusive, end_exclusive
 
 
-class DocItem(NodeItem):  # Base type for any element that carries content, can be a leaf node
-    """DocItem."""
+class DocItem(NodeItem):
+    """Base type for any element that carries content, can be a leaf node."""
 
     label: DocItemLabel
     prov: list[ProvenanceItem] = []
+    source: Annotated[
+        list[ProvenanceType],
+        Field(
+            description="The provenance of this document item. Currently, it is only used for media track provenance."
+        ),
+    ] = []
     comments: list[FineRef] = []  # References to comment items annotating this content
 
     @model_serializer(mode="wrap")
     def _custom_pydantic_serialize(self, handler: SerializerFunctionWrapHandler) -> dict:
         dumped = handler(self)
 
-        # suppress serializing comment list when empty:
-        if dumped.get("comments") == []:
-            del dumped["comments"]
+        # suppress serializing comment and source lists when empty:
+        for field in {"comments", "source"}:
+            if dumped.get(field) == []:
+                del dumped[field]
+
         return dumped
 
     def get_location_tokens(
@@ -1515,10 +1605,13 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if a valid image of the page containing this DocItem is not available
         in doc.
         """
-        if not len(self.prov):
+        if not self.prov or prov_index >= len(self.prov):
+            return None
+        prov = self.prov[prov_index]
+        if not isinstance(prov, ProvenanceItem):
             return None
 
-        page = doc.pages.get(self.prov[prov_index].page_no)
+        page = doc.pages.get(prov.page_no)
         if page is None or page.size is None or page.image is None:
             return None
 
@@ -4625,7 +4718,7 @@ def _with_pictures_refs(
         image_dir.mkdir(parents=True, exist_ok=True)
 
         if image_dir.is_dir():
-            for item, level in result.iterate_items(page_no=page_no, with_groups=False):
+            for item, _ in result.iterate_items(page_no=page_no, with_groups=False):
                 if isinstance(item, PictureItem):
                     img = item.get_image(doc=self)
                     if img is not None:
@@ -4647,7 +4740,8 @@ def _with_pictures_refs(
                             if item.image is None:
                                 scale = img.size[0] / item.prov[0].bbox.width
                                 item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale))
-                            item.image.uri = Path(obj_path)
+                            elif item.image is not None:
+                                item.image.uri = Path(obj_path)
 
                         # if item.image._pil is not None:
                         #    item.image._pil.close()
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
new file mode 100644
index 00000000..32bfc12d
--- /dev/null
+++ b/docling_core/types/doc/webvtt.py
@@ -0,0 +1,696 @@
+"""Models for the Docling's adoption of Web Video Text Tracks format."""
+
+import re
+import warnings
+from collections.abc import Iterator
+from enum import Enum
+from functools import total_ordering
+from typing import Annotated, ClassVar, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic.types import StringConstraints
+from typing_extensions import Self, override
+
+# Entity names accepted by the cue text and annotation validators below.
+_VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+# Matches any "&name;" sequence; group 1 captures the entity name.
+_ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
+# Tag names accepted for a cue span start tag.
+START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
+
+
+class WebVTTLineTerminator(str, Enum):
+    """WebVTT line terminator.
+
+    Each member's value is the literal character sequence of that terminator.
+    """
+
+    CRLF = "\r\n"  # carriage return followed by line feed
+    LF = "\n"  # line feed
+    CR = "\r"  # carriage return
+
+
+# A cue identifier is a non-empty, single-line string that does not contain "-->".
+WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
+
+
+@total_ordering
+class WebVTTTimestamp(BaseModel):
+    """WebVTT timestamp.
+
+    The timestamp is a string consisting of the following components in the given order:
+
+    - hours (optional, required if non-zero): two or more digits, followed by a colon
+    - minutes: two digits between 0 and 59
+    - a colon character (:)
+    - seconds: two digits between 0 and 59
+    - a full stop character (.)
+    - thousandths of a second: three digits
+
+    A WebVTT timestamp is always interpreted relative to the current playback position
+    of the media data that the WebVTT file is to be synchronized with.
+
+    Timestamps compare and order by their total number of seconds, so two instances
+    whose `raw` strings differ (e.g. with and without a zero hours field) can be equal.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    raw: Annotated[
+        str,
+        Field(description="A representation of the WebVTT Timestamp as a single string"),
+    ]
+
+    # groups: 1 = hours (optional), 2 = minutes, 3 = seconds, 4 = milliseconds
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$")
+    _hours: int
+    _minutes: int
+    _seconds: int
+    _millis: int
+
+    @model_validator(mode="after")
+    def validate_raw(self) -> Self:
+        """Validate `raw` and cache its numeric components.
+
+        Raises:
+            ValueError: If `raw` does not match the WebVTT timestamp format.
+        """
+        m = self._pattern.match(self.raw)
+        if not m:
+            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
+        # The pattern already restricts minutes and seconds to 00-59, so no
+        # additional range checks are needed after a successful match.
+        self._hours = int(m.group(1)) if m.group(1) else 0
+        self._minutes = int(m.group(2))
+        self._seconds = int(m.group(3))
+        self._millis = int(m.group(4))
+
+        return self
+
+    @property
+    def seconds(self) -> float:
+        """A representation of the WebVTT Timestamp in seconds."""
+        return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
+
+    @classmethod
+    def from_seconds(cls, seconds: float) -> Self:
+        """Create a WebVTT timestamp from seconds.
+
+        The value is rounded to the nearest millisecond as a whole, so that a
+        fraction rounding up to 1000 ms carries over into the seconds (and, if
+        needed, the minutes and hours) instead of yielding a four-digit
+        milliseconds field.
+
+        Args:
+            seconds: The time in seconds (can include fractional seconds for milliseconds).
+
+        Returns:
+            A WebVTT timestamp instance.
+
+        Raises:
+            ValueError: If `seconds` is negative or not a finite number.
+        """
+        # The chained comparison is False for NaN, negative and infinite values.
+        if not (0 <= seconds < float("inf")):
+            raise ValueError(f"Cannot create a WebVTT timestamp from {seconds} seconds")
+
+        total_millis = round(seconds * 1000)
+        total_secs, millis = divmod(total_millis, 1000)
+        total_mins, secs = divmod(total_secs, 60)
+        hours, minutes = divmod(total_mins, 60)
+
+        return cls(raw=f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}")
+
+    def __eq__(self, other: object) -> bool:
+        """Two timestamps are equal if their total number of seconds is equal."""
+        if not isinstance(other, WebVTTTimestamp):
+            return NotImplemented
+        return self.seconds == other.seconds
+
+    def __lt__(self, other: "WebVTTTimestamp") -> bool:
+        """Return True if this timestamp occurs before `other`."""
+        if not isinstance(other, WebVTTTimestamp):
+            return NotImplemented
+        return self.seconds < other.seconds
+
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the timestamp as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0.
+
+        Returns:
+            The timestamp as MM:SS.mmm when `omit_hours_if_zero` is set and the
+            hours are zero; otherwise the `raw` string, unchanged.
+        """
+        if omit_hours_if_zero and self._hours == 0:
+            return f"{self._minutes:02d}:{self._seconds:02d}.{self._millis:03d}"
+        return self.raw
+
+    @override
+    def __str__(self) -> str:
+        """Return the `raw` string of this WebVTT timestamp.
+
+        The hours field is present only if `raw` contains it. Use
+        `format(omit_hours_if_zero=True)` to get the shorter representation
+        (MM:SS.mmm) whenever hours are zero.
+        """
+        return self.raw
+
+
+class WebVTTCueTimings(BaseModel):
+    """Start and end time offsets of a WebVTT cue."""
+
+    start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")]
+    end: Annotated[WebVTTTimestamp, Field(description="End time offset of the cue")]
+
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        """Reject timings whose end does not come strictly after their start."""
+        if self.start and self.end and self.end <= self.start:
+            raise ValueError("End timestamp must be greater than start timestamp")
+        return self
+
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Render the timings as "start --> end".
+
+        Args:
+            omit_hours_if_zero: Passed on to the `format` method of each timestamp.
+
+        Returns:
+            Both formatted timestamps joined by " --> ".
+        """
+        rendered = [ts.format(omit_hours_if_zero=omit_hours_if_zero) for ts in (self.start, self.end)]
+        return " --> ".join(rendered)
+
+    @override
+    def __str__(self) -> str:
+        """Return "start --> end" using the string form of each timestamp.
+
+        Use `format(omit_hours_if_zero=True)` to get a shorter representation
+        when hours are zero.
+        """
+        return " --> ".join((str(self.start), str(self.end)))
+
+
+class WebVTTCueTextSpan(BaseModel):
+    """A span of plain text inside a WebVTT cue."""
+
+    kind: Literal["text"] = "text"
+    text: Annotated[str, Field(description="The cue text.")]
+
+    @field_validator("text", mode="after")
+    @classmethod
+    def is_valid_text(cls, value: str) -> str:
+        """Check the cue text for unknown entities, stray '&', forbidden characters and emptiness."""
+        # first entity name that is not in the accepted set, if any
+        unknown = next(
+            (m.group(1) for m in _ENTITY_PATTERN.finditer(value) if m.group(1) not in _VALID_ENTITIES),
+            None,
+        )
+        if unknown is not None:
+            raise ValueError(f"Cue text contains an invalid HTML entity: &{unknown};")
+
+        # any '&' left once the entities are removed is not part of an entity
+        without_entities = _ENTITY_PATTERN.sub("", value)
+        if "&" in without_entities:
+            raise ValueError("Found '&' not part of a valid entity in the cue text")
+
+        if not {"\n", "\r", "<"}.isdisjoint(value):
+            raise ValueError("Cue text contains invalid characters")
+        if not value:
+            raise ValueError("Cue text cannot be empty")
+
+        return value
+
+    @override
+    def __str__(self) -> str:
+        """Return the cue text itself."""
+        return self.text
+
+
+class WebVTTCueComponentWithTerminator(BaseModel):
+    """A WebVTT cue component, optionally followed by a line terminator."""
+
+    component: "WebVTTCueComponent"
+    terminator: WebVTTLineTerminator | None = None
+
+    @override
+    def __str__(self) -> str:
+        """Return the component's string form followed by the terminator, if any."""
+        suffix = self.terminator.value if self.terminator else ""
+        return f"{self.component}{suffix}"
+
+
+class WebVTTCueInternalText(BaseModel):
+    """The content enclosed by a WebVTT cue span: an optional leading terminator and components."""
+
+    terminator: WebVTTLineTerminator | None = None
+    components: Annotated[
+        list[WebVTTCueComponentWithTerminator],
+        Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
+    ] = []
+
+    @override
+    def __str__(self) -> str:
+        """Return the leading terminator (if any) followed by all components, concatenated."""
+        prefix = self.terminator.value if self.terminator else ""
+        body = "".join(map(str, self.components))
+        return f"{prefix}{body}"
+
+
+class WebVTTCueSpanStartTag(BaseModel):
+    """Start tag of a WebVTT cue span: a tag name plus optional classes."""
+
+    name: Annotated[START_TAG_NAMES, Field(description="The tag name")]
+    classes: Annotated[
+        list[str] | None,
+        Field(description="List of classes representing the cue span's significance"),
+    ] = None
+
+    @field_validator("classes", mode="after")
+    @classmethod
+    def validate_classes(cls, value: list[str] | None) -> list[str] | None:
+        """Reject classes that are empty or contain a forbidden character."""
+        forbidden = {"\t", "\n", "\r", " ", "&", "<", ">", "."}
+        for css_class in value or []:
+            if not forbidden.isdisjoint(css_class):
+                raise ValueError("A cue span start tag class contains invalid characters")
+            if not css_class:
+                raise ValueError("A cue span start tag class cannot be empty")
+        return value
+
+    def _get_name_with_classes(self) -> str:
+        """Return the tag name followed by its classes, joined with dots."""
+        if not self.classes:
+            return self.name
+        return ".".join([self.name, *self.classes])
+
+    @override
+    def __str__(self) -> str:
+        """Return the start tag in angle brackets, e.g. `<c.first.second>`."""
+        return "<" + self._get_name_with_classes() + ">"
+
+
+class WebVTTCueSpanStartTagAnnotated(WebVTTCueSpanStartTag):
+    """A cue span start tag that additionally carries a mandatory annotation."""
+
+    annotation: Annotated[str, Field(description="Cue span start tag annotation")]
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def is_valid_annotation(cls, value: str) -> str:
+        """Check the annotation for unknown entities, stray '&', forbidden characters and emptiness."""
+        # first entity name that is not in the accepted set, if any
+        unknown = next(
+            (m.group(1) for m in _ENTITY_PATTERN.finditer(value) if m.group(1) not in _VALID_ENTITIES),
+            None,
+        )
+        if unknown is not None:
+            raise ValueError(f"Annotation contains an invalid HTML entity: &{unknown};")
+
+        # any '&' left once the entities are removed is not part of an entity
+        without_entities = _ENTITY_PATTERN.sub("", value)
+        if "&" in without_entities:
+            raise ValueError("Found '&' not part of a valid entity in annotation")
+
+        if not {"\n", "\r", ">"}.isdisjoint(value):
+            raise ValueError("Annotation contains invalid characters")
+        if not value:
+            raise ValueError("Annotation cannot be empty")
+
+        return value
+
+    @override
+    def __str__(self) -> str:
+        """Return the start tag with its annotation, e.g. `<v.loud Speaker>`."""
+        tag_name = self._get_name_with_classes()
+        return f"<{tag_name} {self.annotation}>"
+
+
+class WebVTTCueLanguageSpanStartTag(WebVTTCueSpanStartTagAnnotated):
+    """Start tag of a WebVTT cue language span; its annotation is a language tag."""
+
+    # 2-3 letter primary subtag followed by any number of 2-8 character subtags
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE)
+
+    name: Literal["lang"] = Field("lang", description="The tag name")
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    @override
+    def is_valid_annotation(cls, value: str) -> str:
+        """Accept the annotation only if it matches the language tag pattern."""
+        if not cls._pattern.match(value):
+            raise ValueError("Annotation should be in BCP 47 language tag format")
+        return value
+
+
+class WebVTTCueComponentBase(BaseModel):
+    """WebVTT caption or subtitle cue component.
+
+    All the WebVTT caption or subtitle cue components are represented by this class
+    except the WebVTT cue text span, which requires different definitions.
+    """
+
+    kind: Literal["c", "b", "i", "u", "v", "lang"]
+    start_tag: WebVTTCueSpanStartTag
+    internal_text: WebVTTCueInternalText
+
+    @model_validator(mode="after")
+    def check_tag_names_match(self) -> Self:
+        """Ensure that the start tag name matches this cue component type.
+
+        Raises:
+            ValueError: If `start_tag.name` differs from `kind`.
+        """
+        if self.kind != self.start_tag.name:
+            # f-string so that the expected tag name is interpolated into the message
+            raise ValueError(f"The tag name of this cue component should be {self.kind}")
+        return self
+
+    @override
+    def __str__(self) -> str:
+        """Return the start tag, the internal text and the matching end tag, concatenated."""
+        return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
+
+
+class WebVTTCueVoiceSpan(WebVTTCueComponentBase):
+    """WebVTT cue voice span associated with a specific voice.
+
+    Its start tag must be an annotated start tag.
+    """
+
+    kind: Literal["v"] = "v"
+    # no default: the annotated start tag has a required annotation
+    start_tag: WebVTTCueSpanStartTagAnnotated
+
+
+class WebVTTCueClassSpan(WebVTTCueComponentBase):
+    """WebVTT cue class span.
+
+    It represents a span of text and it is used to annotate parts of the cue with
+    applicable classes without implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["c"] = "c"
+    # defaults to a "c" start tag without classes
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="c")
+
+
+class WebVTTCueItalicSpan(WebVTTCueComponentBase):
+    """WebVTT cue italic span representing a span of italic text."""
+
+    kind: Literal["i"] = "i"
+    # defaults to an "i" start tag without classes
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="i")
+
+
+class WebVTTCueBoldSpan(WebVTTCueComponentBase):
+    """WebVTT cue bold span representing a span of bold text."""
+
+    kind: Literal["b"] = "b"
+    # defaults to a "b" start tag without classes
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="b")
+
+
+class WebVTTCueUnderlineSpan(WebVTTCueComponentBase):
+    """WebVTT cue underline span representing a span of underlined text."""
+
+    kind: Literal["u"] = "u"
+    # defaults to a "u" start tag without classes
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="u")
+
+
+class WebVTTCueLanguageSpan(WebVTTCueComponentBase):
+    """WebVTT cue language span.
+
+    It represents a span of text and it is used to annotate parts of the cue where the
+    applicable language might be different than the surrounding text's, without
+    implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["lang"] = "lang"
+    # no default: the language start tag has a required annotation
+    start_tag: WebVTTCueLanguageSpanStartTag
+
+
+# Union of all cue component models, discriminated by their `kind` field.
+WebVTTCueComponent = Annotated[
+    WebVTTCueTextSpan
+    | WebVTTCueClassSpan
+    | WebVTTCueItalicSpan
+    | WebVTTCueBoldSpan
+    | WebVTTCueUnderlineSpan
+    | WebVTTCueVoiceSpan
+    | WebVTTCueLanguageSpan,
+    Field(
+        discriminator="kind",
+        description="The type of WebVTT caption or subtitle cue component.",
+    ),
+]
+
+
+class WebVTTCueBlock(BaseModel):
+    """Model representing a WebVTT cue block.
+
+    The optional WebVTT cue settings list is not supported.
+    The cue payload is limited to the following spans: text, class, italic, bold,
+    underline, voice, and language.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    identifier: Annotated[WebVTTCueIdentifier | None, Field(description="The WebVTT cue identifier")] = None
+    timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    payload: Annotated[
+        list[WebVTTCueComponentWithTerminator],
+        Field(description="The WebVTT caption or subtitle cue text"),
+    ]
+
+    # pattern of a WebVTT cue span start/end tag
+    _pattern_tag: ClassVar[re.Pattern] = re.compile(
+        r"<(?P<end>/?)"
+        r"(?P<tag>i|b|c|u|v|lang)"
+        r"(?P<class>(?:\.[^\t\n\r &<>.]+)*)"
+        r"(?:[ \t](?P<annotation>[^\n\r&>]*))?>"
+    )
+
+    @field_validator("payload", mode="after")
+    @classmethod
+    def validate_payload(cls, payload):
+        """Ensure that no payload component serializes to text containing '-->'."""
+        for component in payload:
+            if "-->" in str(component):
+                raise ValueError("Cue payload must not contain '-->'")
+        return payload
+
+    @staticmethod
+    def _create_text_components(
+        text: str,
+    ) -> Iterator[WebVTTCueComponentWithTerminator]:
+        """Yield one text span per non-empty line of `text`.
+
+        Every yielded line except the last element of the split gets an LF
+        terminator. When `text` ends with a newline, that last element is the
+        empty string and is skipped, so all yielded lines carry a terminator.
+        """
+        lines = text.split("\n")
+        last_idx = len(lines) - 1
+        for idx, line in enumerate(lines):
+            if not line:
+                continue
+            yield WebVTTCueComponentWithTerminator(
+                component=WebVTTCueTextSpan(text=line),
+                terminator=WebVTTLineTerminator.LF if idx < last_idx else None,
+            )
+
+    @staticmethod
+    def _build_span(
+        tag: dict,
+        children: list[WebVTTCueComponentWithTerminator],
+    ) -> WebVTTCueComponentWithTerminator:
+        """Build the cue component for a closed span.
+
+        Args:
+            tag: The named groups of the span's start tag match, with empty
+                groups mapped to None.
+            children: The components collected between the start and end tag.
+
+        Returns:
+            The span component, wrapped without a line terminator.
+        """
+        name = tag["tag"]
+        annotation = tag["annotation"]
+        classes: list[str] | None = None
+        if tag["class"]:
+            # the class group looks like ".a.b"; drop the empty leading element
+            classes = [c for c in tag["class"].split(".") if c]
+
+        st: WebVTTCueSpanStartTag
+        if annotation and name == "lang":
+            st = WebVTTCueLanguageSpanStartTag(name=name, classes=classes, annotation=annotation.strip())
+        elif annotation:
+            st = WebVTTCueSpanStartTagAnnotated(name=name, classes=classes, annotation=annotation.strip())
+        else:
+            st = WebVTTCueSpanStartTag(name=name, classes=classes)
+
+        it = WebVTTCueInternalText(components=children)
+        cp: WebVTTCueComponent
+        if name == "c":
+            cp = WebVTTCueClassSpan(start_tag=st, internal_text=it)
+        elif name == "b":
+            cp = WebVTTCueBoldSpan(start_tag=st, internal_text=it)
+        elif name == "i":
+            cp = WebVTTCueItalicSpan(start_tag=st, internal_text=it)
+        elif name == "u":
+            cp = WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
+        elif name == "lang":
+            cp = WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
+        else:  # the tag pattern leaves "v" as the only remaining name
+            cp = WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
+        return WebVTTCueComponentWithTerminator(component=cp)
+
+    @classmethod
+    def parse(cls, raw: str) -> Self:
+        """Parse a WebVTT cue block from a string.
+
+        The first line is taken as the cue identifier when it does not contain
+        "-->" and another line follows. Anything after the end timestamp on the
+        timings line (the cue settings list) is ignored. A voice span that opens
+        the cue text and is never closed gets its end tag appended. An end tag
+        that has no open span to close is ignored.
+
+        Args:
+            raw: The raw WebVTT cue block string.
+
+        Returns:
+            The parsed WebVTT cue block.
+
+        Raises:
+            ValueError: If the block is empty, has no timings line, has invalid
+                timings, contains an end tag that does not match the innermost
+                open span, or fails model validation.
+        """
+        lines = raw.strip().splitlines()
+        if not lines:
+            raise ValueError("Cue block must have at least one line")
+        identifier: WebVTTCueIdentifier | None = None
+        timing_line = lines[0]
+        if "-->" not in timing_line and len(lines) > 1:
+            identifier = timing_line
+            timing_line = lines[1]
+            cue_lines = lines[2:]
+        else:
+            cue_lines = lines[1:]
+
+        if "-->" not in timing_line:
+            raise ValueError("Cue block must contain WebVTT cue timings")
+
+        start, end = [t.strip() for t in timing_line.split("-->")]
+        end = re.split(" |\t", end)[0]  # ignore the cue settings list
+        timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
+        cue_text = "\n".join(cue_lines).strip()
+        # adding close tag for a leading voice span without end tag
+        if cue_text.startswith("<v") and "</v>" not in cue_text:
+            cue_text += "</v>"
+
+        # stack[0] collects the top-level payload; every open span pushes a new
+        # list for its children and a matching entry on tag_stack.
+        # NOTE(review): spans still open at the end of the text (other than the
+        # leading voice span handled above) are dropped together with their
+        # content, as in the previous implementation.
+        stack: list[list[WebVTTCueComponentWithTerminator]] = [[]]
+        tag_stack: list[dict] = []
+
+        pos = 0
+        for match in cls._pattern_tag.finditer(cue_text):
+            if match.start() > pos:
+                stack[-1].extend(cls._create_text_components(cue_text[pos : match.start()]))
+            pos = match.end()
+            gps = {k: (v if v else None) for k, v in match.groupdict().items()}
+
+            if not gps["end"]:
+                tag_stack.append(gps)
+                stack.append([])
+                continue
+
+            if not tag_stack:
+                # Stray end tag: skip it. Popping here would remove the
+                # top-level list and make the final stack[0] lookup fail.
+                continue
+
+            closed = tag_stack.pop()
+            children = stack.pop()
+            if (ct := closed["tag"]) != gps["tag"]:
+                raise ValueError(f"Incorrect end tag: {ct}")
+            stack[-1].append(cls._build_span(closed, children))
+
+        if pos < len(cue_text):
+            stack[-1].extend(cls._create_text_components(cue_text[pos:]))
+
+        return cls(
+            identifier=identifier,
+            timings=timings,
+            payload=stack[0],
+        )
+
+    def format(self, omit_hours_if_zero: bool = False, omit_voice_end: bool = False) -> str:
+        """Format the WebVTT cue block as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+            omit_voice_end: If True and this cue block has a WebVTT cue voice span as
+                its only component, omit the voice end tag for brevity.
+
+        Returns:
+            Formatted cue block string: the identifier line (if any), the timings
+            line and the payload, followed by a newline.
+        """
+        parts = []
+        if self.identifier:
+            parts.append(f"{self.identifier}\n")
+        timings_line = self.timings.format(omit_hours_if_zero=omit_hours_if_zero)
+        parts.append(timings_line + "\n")
+
+        only_voice = omit_voice_end and len(self.payload) == 1 and self.payload[0].component.kind == "v"
+        for span in self.payload:
+            rendered = str(span)
+            parts.append(rendered.removesuffix("</v>") if only_voice else rendered)
+
+        return "".join(parts) + "\n"
+
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT cue block.
+
+        Equivalent to `format()` with its default arguments: hours are never
+        stripped from the timestamps and voice end tags are always written. Use
+        `format(omit_hours_if_zero=True)` or `format(omit_voice_end=True)` to get
+        a shorter representation.
+        """
+        return self.format()
+
+
+class WebVTTFile(BaseModel):
+    """A model representing a WebVTT file.
+
+    Supports iteration, indexing and `len()` over its cue blocks.
+    """
+
+    # matches a NOTE, STYLE or REGION block up to the next blank line or the end of the text
+    _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
+    cue_blocks: list[WebVTTCueBlock]
+    title: str | None = None
+
+    @staticmethod
+    def verify_signature(content: str) -> bool:
+        """Verify the WebVTT file signature.
+
+        The content, after an optional leading byte order mark, must start with
+        "WEBVTT" and either end there or continue with a space, a tab or a line
+        terminator character.
+        """
+        content = content.removeprefix("\ufeff")
+        if not content.startswith("WEBVTT"):
+            return False
+        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")
+
+    @model_validator(mode="after")
+    def validate_start_time(self) -> Self:
+        """Validate cue start times.
+
+        The start time offset of the cue must be greater than or equal to the start
+        time offsets of all previous cues.
+
+        Raises:
+            ValueError: If a cue block starts before the block preceding it.
+        """
+        # comparing each block with its predecessor is enough to enforce a
+        # non-decreasing sequence of start times
+        for idx, (previous, current) in enumerate(zip(self.cue_blocks, self.cue_blocks[1:])):
+            if current.timings.start < previous.timings.start:
+                raise ValueError(
+                    f"The start time offset of block {idx + 1} must be greater than or"
+                    " equal to the start time offsets of all previous cues in the file"
+                )
+
+        return self
+
+    @classmethod
+    def parse(cls, raw: str) -> Self:
+        """Parse a WebVTT file.
+
+        NOTE, STYLE and REGION blocks are discarded. A cue block that fails to
+        parse is skipped with a `RuntimeWarning` instead of aborting the parse.
+
+        Args:
+            raw: The raw WebVTT file content.
+
+        Returns:
+            The parsed WebVTT file.
+
+        Raises:
+            ValueError: If the content does not start with the WebVTT signature.
+        """
+        # Drop an optional byte order mark and normalize newlines to LF
+        raw = raw.removeprefix("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
+
+        # Check WebVTT signature
+        if not cls.verify_signature(raw):
+            raise ValueError("Invalid WebVTT file signature")
+
+        # Split off the "WEBVTT" header line; the text after the signature is the title
+        header, _, body = raw.partition("\n")
+        title = header.removeprefix("WEBVTT").strip() or None
+
+        # Remove NOTE/STYLE/REGION blocks
+        body = cls._pattern.sub("", body)
+
+        # Split into cue blocks
+        cues: list[WebVTTCueBlock] = []
+        for block in re.split(r"\n\s*\n", body.strip()):
+            if not block.strip():
+                # a file without cues yields a single empty block; it is not an error
+                continue
+            try:
+                cues.append(WebVTTCueBlock.parse(block))
+            except ValueError as e:
+                warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning)
+
+        return cls(title=title, cue_blocks=cues)
+
+    def __iter__(self) -> Iterator[WebVTTCueBlock]:  # type: ignore[override]
+        """Return an iterator over the cue blocks."""
+        return iter(self.cue_blocks)
+
+    def __getitem__(self, idx) -> WebVTTCueBlock:
+        """Return the cue block at the given index."""
+        return self.cue_blocks[idx]
+
+    def __len__(self) -> int:
+        """Return the number of cue blocks."""
+        return len(self.cue_blocks)
+
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the WebVTT file as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+
+        Returns:
+            Formatted WebVTT file string: the header line followed by the cue
+            blocks, each preceded by a blank line, without trailing newlines.
+        """
+        header = f"WEBVTT {self.title}\n" if self.title else "WEBVTT\n"
+        parts: list[str] = [header]
+
+        for cue_block in self.cue_blocks:
+            parts.append("\n")
+            parts.append(cue_block.format(omit_hours_if_zero=omit_hours_if_zero))
+
+        # Remove the trailing newline from the last cue block
+        return "".join(parts).rstrip("\n")
+
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT file.
+
+        Equivalent to `format()` with its default arguments, so hours are never
+        stripped from the timestamps. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        """
+        return self.format()
diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
index 04761799..26042436 100644
--- a/docling_core/utils/legacy.py
+++ b/docling_core/utils/legacy.py
@@ -7,20 +7,23 @@
 
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    GroupItem,
+    ListItem,
     PictureItem,
     ProvenanceItem,
     SectionHeaderItem,
     Size,
     TableCell,
+    TableData,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
 from docling_core.types.legacy_doc.base import (
     BaseCell,
     BaseText,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 03b7d8cd..6b617f28 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -238,6 +238,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -656,6 +675,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -798,6 +836,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -1203,6 +1260,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -1375,6 +1451,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -1751,6 +1846,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -2139,16 +2253,19 @@
       "type": "object"
     },
     "ProvenanceItem": {
-      "description": "ProvenanceItem.",
+      "description": "Provenance information for elements extracted from a textual document.\n\nA `ProvenanceItem` object acts as a lightweight pointer back into the original\ndocument for an extracted element. It applies to documents with an explicit\nor implicit layout, such as PDF, HTML, docx, or pptx.",
       "properties": {
         "page_no": {
+          "description": "Page number",
           "title": "Page No",
           "type": "integer"
         },
         "bbox": {
-          "$ref": "#/$defs/BoundingBox"
+          "$ref": "#/$defs/BoundingBox",
+          "description": "Bounding box"
         },
         "charspan": {
+          "description": "Character span (0-indexed)",
           "maxItems": 2,
           "minItems": 2,
           "prefixItems": [
@@ -2332,6 +2449,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -2627,6 +2763,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -2832,6 +2987,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -2944,6 +3118,25 @@
           "title": "Prov",
           "type": "array"
         },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
+              },
+              "propertyName": "kind"
+            },
+            "oneOf": [
+              {
+                "$ref": "#/$defs/TrackProvenance"
+              }
+            ]
+          },
+          "title": "Source",
+          "type": "array"
+        },
         "comments": {
           "default": [],
           "items": {
@@ -2997,6 +3190,189 @@
       ],
       "title": "TitleItem",
       "type": "object"
+    },
+    "TrackProvenance": {
+      "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.",
+      "properties": {
+        "kind": {
+          "const": "track",
+          "default": "track",
+          "description": "Identifies this type of provenance.",
+          "title": "Kind",
+          "type": "string"
+        },
+        "start_time": {
+          "description": "Start time offset of the track cue in seconds",
+          "examples": [
+            11.0,
+            6.5,
+            5370.0
+          ],
+          "title": "Start Time",
+          "type": "number"
+        },
+        "end_time": {
+          "description": "End time offset of the track cue in seconds",
+          "examples": [
+            12.0,
+            8.2,
+            5370.1
+          ],
+          "title": "End Time",
+          "type": "number"
+        },
+        "identifier": {
+          "anyOf": [
+            {
+              "pattern": "^(?!.*-->)[^\\n\\r]+$",
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "An identifier of the cue",
+          "examples": [
+            "test",
+            "123",
+            "b72d946"
+          ],
+          "title": "Identifier"
+        },
+        "tags": {
+          "anyOf": [
+            {
+              "items": {
+                "anyOf": [
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTag"
+                  },
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated"
+                  }
+                ]
+              },
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
+          "examples": [
+            [
+              {
+                "annotation": "John",
+                "classes": [
+                  "loud"
+                ],
+                "name": "v"
+              }
+            ],
+            [
+              {
+                "classes": [
+                  "foreignphrase"
+                ],
+                "name": "i"
+              }
+            ]
+          ],
+          "title": "Tags"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "TrackProvenance",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTag": {
+      "description": "WebVTT cue span start tag.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
+          ],
+          "title": "Name",
+          "type": "string"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "List of classes representing the cue span's significance",
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "name"
+      ],
+      "title": "WebVTTCueSpanStartTag",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTagAnnotated": {
+      "description": "WebVTT cue span start tag requiring an annotation.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
+          ],
+          "title": "Name",
+          "type": "string"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "List of classes representing the cue span's significance",
+          "title": "Classes"
+        },
+        "annotation": {
+          "description": "Cue span start tag annotation",
+          "title": "Annotation",
+          "type": "string"
+        }
+      },
+      "required": [
+        "name",
+        "annotation"
+      ],
+      "title": "WebVTTCueSpanStartTagAnnotated",
+      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/data/doc/webvtt_example_01.gt.vtt b/test/data/doc/webvtt_example_01.gt.vtt
new file mode 100644
index 00000000..cad1c72a
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.gt.vtt
@@ -0,0 +1,40 @@
+WEBVTT
+
+00:00:11.000 --> 00:00:13.000
+<v Roger Bingham>We are in New York City</v>
+
+00:00:13.000 --> 00:00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street</v>
+
+00:00:16.000 --> 00:00:18.000
+<v Roger Bingham>from the American Museum of Natural History</v>
+
+00:00:18.000 --> 00:00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson</v>
+
+00:00:20.000 --> 00:00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium</v>
+
+00:00:22.000 --> 00:00:24.000
+<v Roger Bingham>at the AMNH.</v>
+
+00:00:24.000 --> 00:00:26.000
+<v Roger Bingham>Thank you for walking down here.</v>
+
+00:00:27.000 --> 00:00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.</v>
+
+00:00:30.000 --> 00:00:31.500
+<v Roger Bingham>When we e-mailed—</v>
+
+00:00:30.500 --> 00:00:32.500
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?</v>
+
+00:00:32.000 --> 00:00:35.500
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos</v>
+
+00:00:32.500 --> 00:00:33.500
+<v Neil deGrasse Tyson><i>Laughs</i></v>
+
+00:00:35.500 --> 00:00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
new file mode 100644
index 00000000..85d119be
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.json
@@ -0,0 +1,391 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_01",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 16887312431371817791,
+    "filename": "webvtt_example_01.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 11.0,
+          "end_time": 13.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "We are in New York City",
+      "text": "We are in New York City"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 13.0,
+          "end_time": 16.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "We’re actually at the Lucern Hotel, just down the street",
+      "text": "We’re actually at the Lucern Hotel, just down the street"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 16.0,
+          "end_time": 18.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "from the American Museum of Natural History",
+      "text": "from the American Museum of Natural History"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 18.0,
+          "end_time": 20.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "And with me is Neil deGrasse Tyson",
+      "text": "And with me is Neil deGrasse Tyson"
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 20.0,
+          "end_time": 22.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "Astrophysicist, Director of the Hayden Planetarium",
+      "text": "Astrophysicist, Director of the Hayden Planetarium"
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 22.0,
+          "end_time": 24.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "at the AMNH.",
+      "text": "at the AMNH."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 24.0,
+          "end_time": 26.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "Thank you for walking down here.",
+      "text": "Thank you for walking down here."
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 27.0,
+          "end_time": 30.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "And I want to do a follow-up on the last conversation we did.",
+      "text": "And I want to do a follow-up on the last conversation we did."
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 30.0,
+          "end_time": 31.5,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "When we e-mailed—",
+      "text": "When we e-mailed—"
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 30.5,
+          "end_time": 32.5,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
+        }
+      ],
+      "orig": "Didn’t we talk about enough in that conversation?",
+      "text": "Didn’t we talk about enough in that conversation?"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 32.0,
+          "end_time": 35.5,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+      "text": "No! No no no no; 'cos 'cos obviously 'cos"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 32.5,
+          "end_time": 33.5,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
+        }
+      ],
+      "orig": "Laughs",
+      "text": "Laughs",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 35.5,
+          "end_time": 38.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
+        }
+      ],
+      "orig": "You know I’m so excited my glasses are falling off here.",
+      "text": "You know I’m so excited my glasses are falling off here."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt
new file mode 100644
index 00000000..8f9811e7
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.gt.vtt
@@ -0,0 +1,16 @@
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+<v.first.loud Esme>It’s a blue apple tree!</v>
+
+00:00:02.000 --> 00:00:04.000
+<v Mary>No way!</v>
+
+00:00:04.000 --> 00:00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:00:06.000 --> 00:00:08.000
+<v.loud Mary>That’s awesome!</v>
+
+00:00:08.000 --> 00:00:10.000
+Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
new file mode 100644
index 00000000..55fd15ea
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.json
@@ -0,0 +1,308 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_02",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 8584853280299071027,
+    "filename": "webvtt_example_02.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 0.0,
+          "end_time": 2.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme",
+              "classes": [
+                "first",
+                "loud"
+              ]
+            }
+          ]
+        }
+      ],
+      "orig": "It\u2019s a blue apple tree!",
+      "text": "It\u2019s a blue apple tree!"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 2.0,
+          "end_time": 4.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary"
+            }
+          ]
+        }
+      ],
+      "orig": "No way!",
+      "text": "No way!"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme"
+            }
+          ]
+        }
+      ],
+      "orig": "Hee!",
+      "text": "Hee!"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": " ",
+      "text": " "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": "laughter",
+      "text": "laughter",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 6.0,
+          "end_time": 8.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary",
+              "classes": [
+                "loud"
+              ]
+            }
+          ]
+        }
+      ],
+      "orig": "That\u2019s awesome!",
+      "text": "That\u2019s awesome!"
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": "Sur les ",
+      "text": "Sur les "
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0,
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "en"
+            },
+            {
+              "name": "i",
+              "classes": [
+                "foreignphrase"
+              ]
+            }
+          ]
+        }
+      ],
+      "orig": "playground",
+      "text": "playground",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": ", ici \u00e0 Montpellier",
+      "text": ", ici \u00e0 Montpellier"
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.gt.vtt b/test/data/doc/webvtt_example_03.gt.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.gt.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
new file mode 100644
index 00000000..7b6faa6c
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.json
@@ -0,0 +1,503 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_03",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11620880316586573676,
+    "filename": "webvtt_example_03.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      },
+      {
+        "$ref": "#/texts/13"
+      },
+      {
+        "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
+      },
+      {
+        "$ref": "#/texts/16"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "OK,",
+      "text": "OK,"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "I think now we should be recording",
+      "text": "I think now we should be recording"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.571,
+          "end_time": 9.403,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "properly.",
+      "text": "properly."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 10.683,
+          "end_time": 11.563,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+        }
+      ],
+      "orig": "Good.",
+      "text": "Good."
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 13.363,
+          "end_time": 13.803,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "Yeah.",
+      "text": "Yeah."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 49.603,
+          "end_time": 53.363,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "I was also thinking.",
+      "text": "I was also thinking."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 54.963,
+          "end_time": 62.072,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "Would be maybe good to create items,",
+      "text": "Would be maybe good to create items,"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "some metadata,",
+      "text": "some metadata,"
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "some options that can be specific.",
+      "text": "some options that can be specific."
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "Yeah,",
+      "text": "Yeah,"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "I mean I think you went even more than",
+      "text": "I mean I think you went even more than"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 70.563,
+          "end_time": 72.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "But we preserved the atoms.",
+      "text": "But we preserved the atoms."
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "than me.",
+      "text": "than me."
+    },
+    {
+      "self_ref": "#/texts/13",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "I just opened the format.",
+      "text": "I just opened the format."
+    },
+    {
+      "self_ref": "#/texts/14",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 110.222,
+          "end_time": 111.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "give it a try, yeah.",
+      "text": "give it a try, yeah."
+    },
+    {
+      "self_ref": "#/texts/15",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 112.043,
+          "end_time": 115.043,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
+        }
+      ],
+      "orig": "Okay, talk to you later.",
+      "text": "Okay, talk to you later."
+    },
+    {
+      "self_ref": "#/texts/16",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 114.603,
+          "end_time": 115.283,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
+        }
+      ],
+      "orig": "See you.",
+      "text": "See you."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt
new file mode 100644
index 00000000..ce7fcf65
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.gt.vtt
@@ -0,0 +1,9 @@
+WEBVTT Danger of Nitrogen
+
+00:00:01.000 --> 00:00:04.000
+Never drink liquid nitrogen.
+
+00:00:05.000 --> 00:00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
new file mode 100644
index 00000000..98e7da21
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.json
@@ -0,0 +1,210 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11822397499369478441,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/6"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "title",
+      "prov": [],
+      "orig": "Danger of Nitrogen",
+      "text": "Danger of Nitrogen"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 1.0,
+          "end_time": 4.0
+        }
+      ],
+      "orig": "Never drink liquid nitrogen.",
+      "text": "Never drink liquid nitrogen."
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 It will perforate your stomach.",
+      "text": "\u2014 It will perforate your stomach."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 You could ",
+      "text": "\u2014 You could "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
+          ]
+        }
+      ],
+      "orig": "die",
+      "text": "die",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": ".",
+      "text": "."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "John"
+            }
+          ]
+        }
+      ],
+      "orig": "This is true.",
+      "text": "This is true."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt
new file mode 100644
index 00000000..fd7b788c
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.gt.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
new file mode 100644
index 00000000..4af18174
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.json
@@ -0,0 +1,366 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 5389775195091554844,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        },
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14580.0,
+          "end_time": 14760.0,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "Last night the chef surprised us with a culinary adventure.",
+      "text": "Last night the chef surprised us with a culinary adventure."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The waiter offered a ",
+      "text": "The waiter offered a "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "steaming bowl of ",
+      "text": "steaming bowl of ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "es-ES"
+            }
+          ]
+        }
+      ],
+      "orig": "paella",
+      "text": "paella",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+      "text": " that instantly transported the diners to a sunny Mediterranean coast."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The dessert\u2019s ",
+      "text": "The dessert\u2019s "
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
+          ]
+        }
+      ],
+      "orig": "unexpected",
+      "text": "unexpected",
+      "formatting": {
+        "bold": true,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " ",
+      "text": " ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "it"
+            }
+          ]
+        }
+      ],
+      "orig": "arcobaleno",
+      "text": "arcobaleno",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": true,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " of flavors",
+      "text": " of flavors",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " left everyone in awe.",
+      "text": " left everyone in awe."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/webvtt/webvtt_example_01.vtt b/test/data/webvtt/webvtt_example_01.vtt
new file mode 100644
index 00000000..333ca4a8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+<v Roger Bingham>We are in New York City
+
+00:13.000 --> 00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+<v Roger Bingham>from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+<v Roger Bingham>at the AMNH.
+
+00:24.000 --> 00:26.000
+<v Roger Bingham>Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+<v Roger Bingham>When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+<v Neil deGrasse Tyson><i>Laughs</i>
+
+00:35.500 --> 00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.
diff --git a/test/data/webvtt/webvtt_example_02.vtt b/test/data/webvtt/webvtt_example_02.vtt
new file mode 100644
index 00000000..1152a1e8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+<v.first.loud Esme>It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+<v Mary>No way!
+
+00:04.000 --> 00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:06.000 --> 00:08.000
+<v.loud Mary>That’s awesome!
\ No newline at end of file
diff --git a/test/data/webvtt/webvtt_example_03.vtt b/test/data/webvtt/webvtt_example_03.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 00000000..78b5ba0c
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,33 @@
+WEBVTT Danger of Nitrogen
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+STYLE
+::cue {
+    background-image: linear-gradient(to bottom, dimgray, lightgray);
+    color: papayawhip;
+}
+/* Style blocks cannot use blank lines nor "dash dash greater than" */
+
+REGION
+id:editor-comments
+width: 40%
+regionanchor:0%,100%
+viewportanchor:10%,90%
+
+REGION
+id:scroll
+width: 40%
+regionanchor:100%,100%
+viewportanchor:90%,90%
+scroll:up
+
+00:01.000 --> 00:04.000
+Never drink liquid nitrogen.
+
+NOTE I’m not sure the timing is right on the following cue.
+
+00:05.000 --> 00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py
index 58fb50db..28b41ad6 100644
--- a/test/test_deserializer_idoctags.py
+++ b/test/test_deserializer_idoctags.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from test.test_serialization_doctag import verify
 
 import pytest
 
@@ -21,7 +20,8 @@
     TableData,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from test.test_serialization_idoctag import add_texts_section, add_list_section
+from test.test_serialization_doctag import verify
+from test.test_serialization_idoctag import add_list_section, add_texts_section
 
 DO_PRINT: bool = False
 
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 709e2eac..5d569716 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,6 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -37,3 +38,52 @@ def test_prov():
     with pytest.raises(ValidationError, match="at least 2 items"):
         prov["span"] = [0]
         Prov(**prov)
+
+
+def test_track_provenance():
+    """Validate TrackProvenance fields and its use as a text item's source."""
+
+    valid_track = TrackProvenance(
+        start_time=11.0,
+        end_time=12.0,
+        identifier="test",
+        tags=[
+            {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]},
+            {"name": "lang", "annotation": "en"},
+            {"name": "lang", "annotation": "en-GB"},
+            {"name": "i", "classes": ["foreignphrase"]},
+        ],
+    )
+
+    assert valid_track
+    assert valid_track.start_time == 11.0
+    assert valid_track.end_time == 12.0
+    assert valid_track.identifier == "test"
+    assert valid_track.tags
+    assert valid_track.tags[0].annotation == "Mary"
+    assert valid_track.tags[0].classes == ["first", "loud"]
+    assert valid_track.tags[1].annotation == "en"
+    assert valid_track.tags[2].annotation == "en-GB"
+    assert valid_track.tags[3].classes == ["foreignphrase"]
+
+    with pytest.raises(ValidationError, match="end_time"):  # end_time omitted
+        TrackProvenance(start_time=11.0)
+
+    with pytest.raises(ValidationError, match="should be a valid dictionary"):
+        TrackProvenance(
+            start_time=11.0,
+            end_time=12.0,
+            tags=["en"],
+        )
+
+    with pytest.raises(ValidationError, match="must be greater than start"):
+        TrackProvenance(
+            start_time=11.0,
+            end_time=11.0,
+        )
+
+    doc = DoclingDocument(name="Unknown")
+    item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT)
+    item.source = [valid_track]
+    with pytest.raises(ValidationError, match="should be a valid list"):
+        item.source = "Invalid source"
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 6fe3b386..fd68a347 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -15,6 +15,7 @@
     MarkdownParams,
     OrigListItemMarkerMode,
 )
+from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer
 from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
@@ -563,3 +564,27 @@ def test_html_inline_and_formatting():
     ser = HTMLDocSerializer(doc=doc)
     actual = ser.serialize().text
     verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+
+
+# ===============================
+# WebVTT tests
+# ===============================
+
+
+@pytest.mark.parametrize(
+    "file_name",
+    [
+        "webvtt_example_01",
+        "webvtt_example_02",
+        "webvtt_example_03",
+        "webvtt_example_04",
+        "webvtt_example_05",
+    ],
+)
+def test_webvtt(file_name):
+    """Serialize a stored document to WebVTT and verify it against the .gt.vtt file."""
+    json_path = Path(f"./test/data/doc/{file_name}.json")
+    expected_path = json_path.with_suffix(".gt.vtt")
+    document = DoclingDocument.load_from_json(json_path)
+    rendered = WebVTTDocSerializer(doc=document).serialize().text
+    verify(exp_file=expected_path, actual=rendered)
diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py
index 45d0c983..86237a9a 100644
--- a/test/test_serialization_doctag.py
+++ b/test/test_serialization_doctag.py
@@ -6,9 +6,7 @@
     DocTagsDocSerializer,
     DocTagsParams,
 )
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DoclingDocument, TableData
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData
 
 from .test_serialization import verify
 
diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py
index 43aaa79e..1c0f8479 100644
--- a/test/test_serialization_idoctag.py
+++ b/test/test_serialization_idoctag.py
@@ -2,37 +2,39 @@
 
 from pathlib import Path
 from typing import Optional
-from test.test_serialization import verify
 
 import pytest
 
 from docling_core.experimental.idoctags import (
     ContentType,
-    WrapMode,
     EscapeMode,
     IDocTagsDocSerializer,
     IDocTagsParams,
     IDocTagsSerializationMode,
     IDocTagsVocabulary,
+    WrapMode,
 )
 from docling_core.types.doc import (
+    BoundingBox,
+    CodeLanguageLabel,
+    CoordOrigin,
+    DescriptionMetaField,
     DocItemLabel,
     DoclingDocument,
     Formatting,
-    Script,
-    TableData,
-)
-from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.document import (
-    DescriptionMetaField,
+    PictureClassificationLabel,
     PictureClassificationMetaField,
     PictureClassificationPrediction,
     PictureMeta,
     ProvenanceItem,
+    Script,
+    Size,
     SummaryMetaField,
+    TableData,
     TabularChartMetaField,
 )
-from docling_core.types.doc.labels import CodeLanguageLabel, PictureClassificationLabel
+from test.test_serialization import verify
+
 
 def add_texts_section(doc: DoclingDocument):
     doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
@@ -427,7 +429,7 @@ def test_content_allow_all_types(sample_doc: DoclingDocument):
     serializer = IDocTagsDocSerializer(
         doc=doc,
         params=IDocTagsParams(
-            content_types={ct for ct in ContentType},
+            content_types=set(ContentType),
         ),
     )
     ser_txt = serializer.serialize().text
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
new file mode 100644
index 00000000..5b1693e3
--- /dev/null
+++ b/test/test_webvtt.py
@@ -0,0 +1,308 @@
+"""Test the data model for WebVTT files.
+
+Examples extracted from https://www.w3.org/TR/webvtt1/
+Copyright © 2019 World Wide Web Consortium.
+"""
+
+import warnings
+
+import pytest
+from pydantic import ValidationError
+
+from docling_core.types.doc.webvtt import (
+    WebVTTCueBlock,
+    WebVTTCueComponentWithTerminator,
+    WebVTTCueInternalText,
+    WebVTTCueItalicSpan,
+    WebVTTCueLanguageSpan,
+    WebVTTCueLanguageSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTextSpan,
+    WebVTTCueTimings,
+    WebVTTCueVoiceSpan,
+    WebVTTFile,
+    WebVTTTimestamp,
+)
+
+from .test_data_gen_flag import GEN_TEST_DATA
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents() -> None:
+    """Test validation and string rendering of WebVTT cue components."""
+    valid_timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    valid_total_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for ts, expected_seconds in zip(valid_timestamps, valid_total_seconds):
+        model = WebVTTTimestamp(raw=ts)
+        assert model.seconds == expected_seconds
+
+    # Test invalid WebVTT timestamps.
+    invalid_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for ts in invalid_timestamps:
+        with pytest.raises(ValidationError):
+            WebVTTTimestamp(raw=ts)
+
+    # Test the timestamp __str__ method.
+    model = WebVTTTimestamp(raw="00:01:02.345")
+    assert str(model) == "00:01:02.345"
+
+    # Test valid cue timings.
+    start = WebVTTTimestamp(raw="00:10.005")
+    end = WebVTTTimestamp(raw="00:14.007")
+    cue_timings = WebVTTCueTimings(start=start, end=end)
+    assert cue_timings.start == start
+    assert cue_timings.end == end
+    assert str(cue_timings) == "00:10.005 --> 00:14.007"
+
+    # Test invalid cue timings with end timestamp before start.
+    start = WebVTTTimestamp(raw="00:10.700")
+    end = WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        WebVTTCueTimings(start=start, end=end)
+    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
+
+    # Test invalid cue timings with missing end.
+    start = WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        WebVTTCueTimings(start=start)  # type: ignore[call-arg]
+    assert "Field required" in str(excinfo.value)
+
+    # Test invalid cue timings with missing start.
+    end = WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        WebVTTCueTimings(end=end)  # type: ignore[call-arg]
+    assert "Field required" in str(excinfo.value)
+
+    # Test with valid text.
+    valid_text = "This is a valid cue text span."
+    span = WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+    assert str(span) == valid_text
+
+    # Test with text containing newline characters.
+    invalid_text = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        WebVTTCueTextSpan(text=invalid_text)
+
+    # Test with text containing ampersand.
+    invalid_text = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        WebVTTCueTextSpan(text=invalid_text)
+    invalid_text = "An invalid &foo; entity"
+    with pytest.raises(ValidationError):
+        WebVTTCueTextSpan(text=invalid_text)
+    valid_text = "My favorite book is Pride &amp; Prejudice"
+    span = WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+
+    # Test with text containing less-than sign.
+    invalid_text = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        WebVTTCueTextSpan(text=invalid_text)
+
+    # Test with empty text.
+    with pytest.raises(ValidationError):
+        WebVTTCueTextSpan(text="")
+
+    # Test that annotation validation works correctly.
+    valid_annotation = "valid-annotation"
+    invalid_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
+    assert WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
+
+    # Test that classes validation works correctly.
+    annotation = "speaker name"
+    valid_classes = ["class1", "class2"]
+    invalid_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation=annotation, classes=invalid_classes
+        )
+    assert WebVTTCueSpanStartTagAnnotated(
+        name="v", annotation=annotation, classes=valid_classes
+    )
+
+    # Test that components validation works correctly: wrapped cue components
+    # are accepted, while a bare int or str in the list is rejected.
+    valid_components = [
+        WebVTTCueComponentWithTerminator(
+            component=WebVTTCueTextSpan(text="random text")
+        )
+    ]
+    invalid_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        WebVTTCueInternalText(components=invalid_components)
+    assert WebVTTCueInternalText(components=valid_components)
+
+    # Test valid cue voice spans.
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation="speaker", classes=["loud", "clear"]
+        ),
+        internal_text=WebVTTCueInternalText(
+            components=[
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
+    )
+    expected_str = "<v.loud.clear speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
+        internal_text=WebVTTCueInternalText(
+            components=[
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
+    )
+    expected_str = "<v speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+
+def test_webvttcueblock_parse() -> None:
+    """Test the parse method of the WebVTTCueBlock class."""
+    raw: str = (
+        "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
+    )
+    block: WebVTTCueBlock = WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:02.500 --> 04:05.000"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
+    assert (
+        block.payload[0].component.text
+        == "J’ai commencé le basket à l'âge de 13, 14 ans"
+    )
+    assert raw == str(block)  # parsing then formatting must round-trip
+
+    raw = (
+        "04:05.001 --> 04:07.800\n"
+        "Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier\n"
+    )
+    block = WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:05.001 --> 04:07.800"
+    assert len(block.payload) == 3
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Sur les "
+    assert isinstance(block.payload[1], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[1].component, WebVTTCueItalicSpan)
+    assert len(block.payload[1].component.internal_text.components) == 1
+    lang_span = block.payload[1].component.internal_text.components[0].component
+    assert isinstance(lang_span, WebVTTCueLanguageSpan)
+    assert isinstance(
+        lang_span.internal_text.components[0].component, WebVTTCueTextSpan
+    )
+    assert lang_span.internal_text.components[0].component.text == "playground"
+    assert isinstance(block.payload[2], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[2].component, WebVTTCueTextSpan)
+    assert block.payload[2].component.text == ", ici à Montpellier"
+    assert raw == str(block)  # nested tags must round-trip as well
+
+
+def test_webvtt_file() -> None:
+    """Parse the WebVTT example files 01-04 and check their cue blocks and title."""
+    with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span.component, WebVTTCueVoiceSpan)
+    assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.component.start_tag.classes
+    assert len(cue_span.component.internal_text.components) == 1
+    comp = cue_span.component.internal_text.components[0]
+    assert isinstance(comp.component, WebVTTCueItalicSpan)
+    assert len(comp.component.internal_text.components) == 1
+    comp2 = comp.component.internal_text.components[0]
+    assert isinstance(comp2.component, WebVTTCueTextSpan)
+    assert comp2.component.text == "Laughs"
+
+    with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n".join(
+        [
+            block.format(omit_hours_if_zero=True, omit_voice_end=True)
+            for block in vtt.cue_blocks
+        ]
+    )
+    assert content == reverse.rstrip()
+
+    with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0].component, WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Good."
+    assert not vtt.title
+
+    with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
+        content = f.read()
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")  # turn parser warnings into failures
+            vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 2  # only cue blocks are counted, not STYLE/REGION/NOTE
+    block = vtt.cue_blocks[1]
+    assert len(block.payload) == 5
+    assert str(block) == (
+        "00:05.000 --> 00:09.000\n"
+        "— It will perforate your stomach.\n"
+        "— You could <b.loud>die</b>.\n"
+        "<v John>This is true.</v>\n"
+    )
+    assert vtt.title == "Danger of Nitrogen"  # text following "WEBVTT" in the header
+
+
+def test_webvtt_cue_language_span_start_tag():
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
+    with pytest.raises(ValidationError, match="BCP 47"):  # underscore, not hyphen
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
+    with pytest.raises(ValidationError, match="BCP 47"):  # numeric first subtag
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')