diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py index 385aca6a..1addf996 100644 --- a/docling_core/transforms/serializer/azure.py +++ b/docling_core/transforms/serializer/azure.py @@ -44,9 +44,10 @@ DocSerializer, create_ser_result, ) -from docling_core.types.doc.base import CoordOrigin -from docling_core.types.doc.document import ( +from docling_core.types.doc import ( + CoordOrigin, DocItem, + DocItemLabel, DoclingDocument, FormItem, InlineGroup, @@ -59,7 +60,6 @@ TableItem, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel def _bbox_to_polygon_coords( diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 3a8ad71c..43bfd54b 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -35,11 +35,11 @@ SerializationResult, Span, ) -from docling_core.types.doc.document import ( - DOCUMENT_TOKENS_EXPORT_LABELS, +from docling_core.types.doc import ( ContentLayer, DescriptionAnnotation, DocItem, + DocItemLabel, DoclingDocument, FloatingItem, Formatting, @@ -57,7 +57,7 @@ TableItem, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS _DEFAULT_LAYERS = set(ContentLayer) @@ -317,7 +317,7 @@ def serialize_doc( parts: list[SerializationResult], **kwargs: Any, ) -> SerializationResult: - """Serialize a document out of its pages.""" + """Serialize a document out of its parts.""" ... 
def _serialize_body(self, **kwargs) -> SerializationResult: diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index e5672638..dc8c520f 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -26,11 +26,13 @@ _should_use_legacy_annotations, create_ser_result, ) -from docling_core.types.doc.base import BoundingBox from docling_core.types.doc.document import ( + BoundingBox, CodeItem, DocItem, + DocItemLabel, DoclingDocument, + DocumentToken, FloatingItem, FormItem, GroupItem, @@ -40,6 +42,7 @@ ListItem, NodeItem, PictureClassificationData, + PictureClassificationLabel, PictureItem, PictureMoleculeData, PictureTabularChartData, @@ -47,10 +50,9 @@ SectionHeaderItem, TableData, TableItem, + TableToken, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel -from docling_core.types.doc.tokens import DocumentToken, TableToken def _wrap(text: str, wrap_tag: str) -> str: diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py new file mode 100644 index 00000000..eba06b36 --- /dev/null +++ b/docling_core/transforms/serializer/webvtt.py @@ -0,0 +1,556 @@ +"""Define classes for WebVTT serialization.""" + +import logging +import re +from typing import Any, get_args + +from pydantic import BaseModel +from typing_extensions import override + +from docling_core.transforms.serializer.base import ( + BaseAnnotationSerializer, + BaseDocSerializer, + BaseFallbackSerializer, + BaseFormSerializer, + BaseInlineSerializer, + BaseKeyValueSerializer, + BaseListSerializer, + BaseMetaSerializer, + BasePictureSerializer, + BaseTableSerializer, + BaseTextSerializer, + SerializationResult, +) +from docling_core.transforms.serializer.common import ( + CommonParams, + DocSerializer, + create_ser_result, +) +from docling_core.types.doc.document import ( + ContentLayer, + DocItem, + DocItemLabel, + 
DoclingDocument, + Formatting, + FormItem, + InlineGroup, + KeyValueItem, + ListGroup, + NodeItem, + PictureItem, + TableItem, + TextItem, + TitleItem, + TrackProvenance, +) +from docling_core.types.doc.webvtt import ( + START_TAG_NAMES, + WebVTTCueBlock, + WebVTTCueSpanStartTag, + WebVTTCueSpanStartTagAnnotated, + WebVTTCueTimings, + WebVTTFile, + WebVTTLineTerminator, + WebVTTTimestamp, +) + +_logger = logging.getLogger(__name__) + + +def _remove_consecutive_pairs(text: str) -> str: + """Remove one pass of consecutive start/end tag pairs. + + This function looks for a closing tag followed by an opening tag where the tags are identical + and removes them. It handles two cases: + 1. Direct adjacent tags with content: a tagged span, optional whitespace, then the same tag reopened + 2. Tags with other tags in between: a closing tag, one other tag, then the same tag reopened + + Args: + text: Input string + + Returns: + String with one pass of consecutive pairs removed + """ + # Pattern 1: Direct adjacent tags with same classes and annotations + pattern1 = re.compile( + r"<([bciuv]|lang)((?:\.\w+)*)(?:\s+([^>]+))?>" # Opening tag: capture tag, classes, annotation + r"((?:(?!).)*?)" # Content (non-greedy, not containing the closing tag) + r"" # Closing tag + r"(\s*)" # Capture whitespace between tags (including newlines) + r"<\1((?:\.\w+)*)(?:\s+([^>]+))?>" # Next opening tag: capture classes and annotation + ) + + def replacer1(match: re.Match[str]) -> str: + tag = match.group(1) + classes1 = match.group(2) or "" + anno1 = match.group(3) or "" + content = match.group(4) + whitespace = match.group(5) # Whitespace between tags + classes2 = match.group(6) or "" + anno2 = match.group(7) or "" + + # Only merge if classes and annotations match + if classes1 == classes2 and anno1 == anno2: + # Merge: remove the closing and opening tags, but keep the whitespace + return f"<{tag}{classes1}{' ' + anno1 if anno1 else ''}>{content}{whitespace}" + else: + # Don't merge - return original + return match.group(0) + + # Pattern 2: Tags with other tags in between + # This removes redundant and when there's 
another tag in between + pattern2 = re.compile( + r"" # Closing tag + r"(<[^>]+>)" # Any other tag in between + r"<\1(?:\.\w+)*(?:\s+[^>]+)?>" # Same opening tag (with any classes/annotations) + ) + + def replacer2(match: re.Match[str]) -> str: + # Just keep the middle tag, remove the closing and opening of the same type + return match.group(2) + + result = pattern1.sub(replacer1, text) + result = pattern2.sub(replacer2, result) + + return result + + +class WebVTTParams(CommonParams): + """Serialization parameters for the Web Video Text Tracks (WebVTT) format.""" + + layers: set[ContentLayer] = {ContentLayer.BODY} + + +class WebVTTTextSerializer(BaseModel, BaseTextSerializer): + """Text serializer to Web Video Text Tracks (WebVTT) format.""" + + @override + def serialize( + self, + *, + item: TextItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + is_inline_scope: bool = False, + visited: set[str] | None = None, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the passed item.""" + # Handle TitleItem specially - it doesn't have provenance but we need its text + if isinstance(item, TitleItem): + return create_ser_result(text=item.text, span_source=item) + + # Only process items with TrackProvenance (WebVTT cues) + if not item.text or not item.source or item.source[0].kind != "track": + return create_ser_result() + + # Apply post-processing here: formatting, classes, language, and voice + # If the TextItem is part of an InlineGroup, we need to further post-process it + # within the group context + + prov: TrackProvenance = item.source[0] + text: str = doc_serializer.post_process( + text=item.text, + formatting=item.formatting, + tags=prov.tags, + ) + if is_inline_scope: + # Iteratively remove unnecessary consecutive tag pairs until no more changes + prev_text: str | None = None + while prev_text != text: + prev_text = text + text = _remove_consecutive_pairs(text) + + return create_ser_result(text=text, span_source=item) + + +class 
_WebVTTTableSerializer(BaseTableSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: TableItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTPictureSerializer(BasePictureSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: PictureItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTKeyValueSerializer(BaseKeyValueSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: KeyValueItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTFormSerializer(BaseFormSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: FormItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTFallbackSerializer(BaseFallbackSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (item, doc_serializer, doc, kwargs) + return create_ser_result() + + +class _WebVTTListSerializer(BaseModel, BaseListSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: ListGroup, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + list_level: int 
= 0, + is_inline_scope: bool = False, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, list_level, is_inline_scope, item, doc_serializer, kwargs) + return create_ser_result() + + +class WebVTTInlineSerializer(BaseInlineSerializer): + """Inline group serializer to Web Video Text Tracks (WebVTT) format.""" + + @override + def serialize( + self, + *, + item: InlineGroup, + doc_serializer: "BaseDocSerializer", + doc: DoclingDocument, + list_level: int = 0, + visited: set[str] | None = None, + **kwargs: Any, + ) -> SerializationResult: + """Serializes an inline group to WebVTT format.""" + _ = doc + my_visited = visited if visited is not None else set() + parts = doc_serializer.get_parts( + item=item, + list_level=list_level, + is_inline_scope=True, + visited=my_visited, + **kwargs, + ) + # Include all parts, even if text is empty or whitespace-only + # Use 'is not None' instead of truthiness check to preserve whitespace + text_res = "".join([p.text for p in parts if p.text is not None]) + + # Apply tag normalization to the concatenated result + # Iteratively remove consecutive pairs until no more changes + prev_text = None + while prev_text != text_res: + prev_text = text_res + text_res = _remove_consecutive_pairs(text_res) + + return create_ser_result(text=text_res, span_source=parts) + + +class _WebVTTMetaSerializer(BaseModel, BaseMetaSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, item, kwargs) + return create_ser_result() + + +class _WebVTTAnnotationSerializer(BaseModel, BaseAnnotationSerializer): + """No-op for WebVTT output (not represented).""" + + @override + def serialize( + self, + *, + item: DocItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + _ = (doc, item, kwargs) + return create_ser_result() + + +class WebVTTDocSerializer(DocSerializer): + """Document serializer 
to Web Video Text Tracks (WebVTT) format.""" + + text_serializer: BaseTextSerializer = WebVTTTextSerializer() + table_serializer: BaseTableSerializer = _WebVTTTableSerializer() + picture_serializer: BasePictureSerializer = _WebVTTPictureSerializer() + key_value_serializer: BaseKeyValueSerializer = _WebVTTKeyValueSerializer() + form_serializer: BaseFormSerializer = _WebVTTFormSerializer() + fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer() + list_serializer: BaseListSerializer = _WebVTTListSerializer() + inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer() + meta_serializer: BaseMetaSerializer | None = _WebVTTMetaSerializer() + annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer() + + params: CommonParams = CommonParams() + + @override + def requires_page_break(self) -> bool: + """Whether to add page breaks. + + WebVTT format does not support page breaks. + """ + return False + + @override + def serialize_bold(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific bold serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("b", []) + + return self.serialize_cue_span( + text, + tag="b", + css=classes, + ) + + @override + def serialize_italic(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific italic serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("i", []) + + return self.serialize_cue_span( + text, + tag="i", + css=classes, + ) + + @override + def serialize_underline(self, text: str, **kwargs: Any) -> str: + """Apply WebVTT-specific underline serialization.""" + classes: list[str] = kwargs.get("classes", {}).get("u", []) + + return self.serialize_cue_span( + text, + tag="u", + css=classes, + ) + + def serialize_cue_span( + self, + text: str, + tag: START_TAG_NAMES, + anno: str | None = None, + css: list[str] | None = None, + ) -> str: + """Apply serialization to a WebVTT cue span.""" + start_tag: WebVTTCueSpanStartTag + if tag in {"b", 
"i", "u", "c"}: + start_tag = WebVTTCueSpanStartTag(name=tag, classes=css) + elif tag in {"v", "lang"}: + if not anno: + _logger.warning(f"Invalid {tag} cue span without annotation: {text}") + return text + else: + start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=css, annotation=anno) + else: + return text + + res: str = f"{start_tag}{text}" + return res + + @staticmethod + def _extract_classes(classes: list[str]) -> dict[str, list[str]]: + """Extract tag and values from provenance classes. + + Args: + classes: The classes from a TrackProvenance object. + + Returns: + Map of tag to class values. + """ + res: dict[str, list[str]] = {} + for item in classes or []: + for prefix in get_args(START_TAG_NAMES): + if item == prefix: + res[prefix] = [] + break + elif item.startswith(prefix + "."): + cls_str: str = item[len(prefix) + 1 :] + res[prefix] = cls_str.split(".") + break + return res + + @override + def serialize_doc( + self, + *, + parts: list[SerializationResult], + **kwargs: Any, + ) -> SerializationResult: + """Serialize a document out of its parts.""" + title: str | None = None + + timings: WebVTTCueTimings | None = None + id: str | None = None + text: str = "" + cue_blocks: list[WebVTTCueBlock] = [] + for part in parts: + if not part.text or not part.spans: + continue + + # Get the doc item from the first span + doc_item: DocItem = part.spans[0].item + + # Handle title items (check both TitleItem type and label) + if isinstance(doc_item, TitleItem) or ( + isinstance(doc_item, TextItem) and doc_item.label == DocItemLabel.TITLE + ): + title = part.text + continue + if isinstance(doc_item, InlineGroup) and doc_item.children: + doc_item = doc_item.children[0].resolve(doc=self.doc) + if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track": + prov: TrackProvenance = doc_item.source[0] + if ( + prov.identifier == id + and timings + and timings.start.seconds == prov.start_time + and timings.end.seconds == prov.end_time 
+ ): + # When combining items with same timing, add newline and merge consecutive tags + combined = text.rstrip() + WebVTTLineTerminator.LF.value + part.text + # Use _remove_consecutive_pairs to merge tags like \n + # Iteratively remove consecutive pairs until no more changes + prev_combined = None + while prev_combined != combined: + prev_combined = combined + combined = _remove_consecutive_pairs(combined) + text = combined + WebVTTLineTerminator.LF.value + else: + if text: + cue_blocks.append(WebVTTCueBlock.parse(text)) + timings = WebVTTCueTimings( + start=WebVTTTimestamp.from_seconds(prov.start_time), + end=WebVTTTimestamp.from_seconds(prov.end_time), + ) + id = prov.identifier + text = ( + f"{id + WebVTTLineTerminator.LF.value if id else ''}{timings}" + f"{WebVTTLineTerminator.LF.value}{part.text}" + f"{WebVTTLineTerminator.LF.value}" + ) + if text: + cue_blocks.append(WebVTTCueBlock.parse(text)) + + webvtt_file = WebVTTFile(title=title, cue_blocks=cue_blocks) + content = str(webvtt_file) + return create_ser_result(text=content, span_source=parts) + + def post_process( + self, + text: str, + formatting: Formatting | None = None, + tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None, + **kwargs: Any, + ) -> str: + """Apply some text post-processing steps by adding formatting tags. + + The order of the formatting tags is determined by this function and `DocSerializer.post_process`, + from the innermost to the outermost: + 1. language () + 2. underline () + 3. italic () + 4. bold () + 5. class () + 6. 
voice () + """ + res: str = text + # cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {} + + languages: list[WebVTTCueSpanStartTagAnnotated] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang" + ] + for lang in languages: + res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes) + + format_classes = { + item.name: item.classes + for item in tags or [] + if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"} + } + res = super().post_process(text=res, formatting=formatting, classes=format_classes) + + class_tag: list[WebVTTCueSpanStartTag] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c" + ] + if class_tag: + res = self.serialize_cue_span( + text=res, + tag="c", + css=class_tag[0].classes, + ) + + voice: list[WebVTTCueSpanStartTagAnnotated] = [ + item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v" + ] + if voice: + res = self.serialize_cue_span( + text=res, + tag="v", + anno=voice[0].annotation, + css=voice[0].classes, + ) + + return res diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py index 5ed7b843..89b07f77 100644 --- a/docling_core/transforms/visualizer/key_value_visualizer.py +++ b/docling_core/transforms/visualizer/key_value_visualizer.py @@ -16,8 +16,12 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc.document import ContentLayer, DoclingDocument -from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel +from docling_core.types.doc import ( + ContentLayer, + DoclingDocument, + GraphCellLabel, + GraphLinkLabel, +) # --------------------------------------------------------------------------- # Helper functions / constants diff --git 
a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py index 369a7b38..043fedac 100644 --- a/docling_core/transforms/visualizer/layout_visualizer.py +++ b/docling_core/transforms/visualizer/layout_visualizer.py @@ -10,10 +10,15 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.base import CoordOrigin -from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument -from docling_core.types.doc.page import BoundingRectangle, TextCell +from docling_core.types.doc import ( + BoundingRectangle, + ContentLayer, + CoordOrigin, + DocItem, + DocItemLabel, + DoclingDocument, + TextCell, +) class _TLBoundingRectangle(BoundingRectangle): diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py index 489a6d9a..5f601f9a 100644 --- a/docling_core/transforms/visualizer/table_visualizer.py +++ b/docling_core/transforms/visualizer/table_visualizer.py @@ -10,7 +10,7 @@ from typing_extensions import override from docling_core.transforms.visualizer.base import BaseVisualizer -from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem +from docling_core.types.doc import ContentLayer, DoclingDocument, TableItem _log = logging.getLogger(__name__) diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index 3c699f89..c3a2b237 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -46,6 +46,7 @@ PictureClassificationClass, PictureClassificationData, PictureClassificationMetaField, + PictureClassificationPrediction, PictureDataType, PictureItem, PictureLineChartData, @@ -56,17 +57,20 @@ PictureStackedBarChartData, PictureTabularChartData, ProvenanceItem, + ProvenanceType, RefItem, RichTableCell, Script, 
SectionHeaderItem, SummaryMetaField, + TableAnnotationType, TableCell, TableData, TableItem, TabularChartMetaField, TextItem, TitleItem, + TrackProvenance, UnorderedList, ) from .labels import ( diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 0ecc3e51..a9dd4aa8 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -65,6 +65,7 @@ ) from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import parse_otsl_table_content, relative_path +from docling_core.types.doc.webvtt import WebVTTCueIdentifier, WebVTTCueSpanStartTag, WebVTTCueSpanStartTagAnnotated _logger = logging.getLogger(__name__) @@ -958,6 +959,7 @@ class DocumentOrigin(BaseModel): "text/asciidoc", "text/markdown", "text/csv", + "text/vtt", "audio/x-wav", "audio/wav", "audio/mp3", @@ -1155,11 +1157,91 @@ def from_multipage_doctags_and_images( class ProvenanceItem(BaseModel): - """ProvenanceItem.""" + """Provenance information for elements extracted from a textual document. - page_no: int - bbox: BoundingBox - charspan: tuple[int, int] + A `ProvenanceItem` object acts as a lightweight pointer back into the original + document for an extracted element. It applies to documents with an explicit + or implicit layout, such as PDF, HTML, docx, or pptx. + """ + + page_no: Annotated[int, Field(description="Page number")] + bbox: Annotated[BoundingBox, Field(description="Bounding box")] + charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")] + + +class BaseProvenance(BaseModel): + """Base class for provenance information. + + Represents the provenance of an extracted component within a digital asset. + """ + + kind: Annotated[ + str, Field(description="Kind of provenance. It is used as a discriminator for the provenance type.") + ] + + +class TrackProvenance(BaseProvenance): + """Provenance metadata for a cue extracted from a media track. 
+ + A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions, + etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle + block, an audio clip, or a timed marker in a screen-recording. + """ + + model_config = ConfigDict(regex_engine="python-re") + kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track" + start_time: Annotated[ + float, + Field( + examples=[11.0, 6.5, 5370.0], + description="Start time offset of the track cue in seconds", + ), + ] + end_time: Annotated[ + float, + Field( + examples=[12.0, 8.2, 5370.1], + description="End time offset of the track cue in seconds", + ), + ] + identifier: Annotated[ + WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"]) + ] = None + tags: Annotated[ + list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None, + Field( + description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).", + examples=[ + [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")], + [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])], + ], + ), + ] = None + + @model_validator(mode="after") + def check_order(self) -> Self: + """Ensure start time is less than the end time.""" + if self.end_time <= self.start_time: + raise ValueError("End time must be greater than start time") + return self + + +ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")] +"""Union type for all provenance types. + +This type alias represents a discriminated union of all available provenance types that can be associated with +extracted elements in a document. The `kind` field is used as a discriminator to determine the specific +provenance type at runtime. 
+ +Currently supported provenance types: + - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles) + +Notes: + - Additional provenance types may be added to this union in the future to support + other content sources. + - For documents with an implicit or explicit layout, such as PDF, HTML, docx, pptx, or markdown files, the + `ProvenanceItem` should still be used. +""" class ContentLayer(str, Enum): @@ -1317,7 +1399,7 @@ class PictureMeta(FloatingMeta): tabular_chart: Optional[TabularChartMetaField] = None -class NodeItem(BaseModel): +class NodeItem(BaseModel, validate_assignment=True): """NodeItem.""" self_ref: str = Field(pattern=_JSON_POINTER_REGEX) @@ -1464,20 +1546,28 @@ class FineRef(RefItem): range: Optional[tuple[int, int]] = None # start_inclusive, end_exclusive -class DocItem(NodeItem): # Base type for any element that carries content, can be a leaf node - """DocItem.""" +class DocItem(NodeItem): + """Base type for any element that carries content, can be a leaf node.""" label: DocItemLabel prov: list[ProvenanceItem] = [] + source: Annotated[ + list[ProvenanceType], + Field( + description="The provenance of this document item. Currently, it is only used for media track provenance." + ), + ] = [] comments: list[FineRef] = [] # References to comment items annotating this content @model_serializer(mode="wrap") def _custom_pydantic_serialize(self, handler: SerializerFunctionWrapHandler) -> dict: dumped = handler(self) - # suppress serializing comment list when empty: - if dumped.get("comments") == []: - del dumped["comments"] + # suppress serializing comment and source lists when empty: + for field in {"comments", "source"}: + if dumped.get(field) == []: + del dumped[field] + return dumped def get_location_tokens( @@ -1515,10 +1605,13 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL if a valid image of the page containing this DocItem is not available in doc. 
""" - if not len(self.prov): + if not self.prov or prov_index >= len(self.prov): + return None + prov = self.prov[prov_index] + if not isinstance(prov, ProvenanceItem): return None - page = doc.pages.get(self.prov[prov_index].page_no) + page = doc.pages.get(prov.page_no) if page is None or page.size is None or page.image is None: return None @@ -4625,7 +4718,7 @@ def _with_pictures_refs( image_dir.mkdir(parents=True, exist_ok=True) if image_dir.is_dir(): - for item, level in result.iterate_items(page_no=page_no, with_groups=False): + for item, _ in result.iterate_items(page_no=page_no, with_groups=False): if isinstance(item, PictureItem): img = item.get_image(doc=self) if img is not None: @@ -4647,7 +4740,8 @@ def _with_pictures_refs( if item.image is None: scale = img.size[0] / item.prov[0].bbox.width item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale)) - item.image.uri = Path(obj_path) + elif item.image is not None: + item.image.uri = Path(obj_path) # if item.image._pil is not None: # item.image._pil.close() diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py new file mode 100644 index 00000000..32bfc12d --- /dev/null +++ b/docling_core/types/doc/webvtt.py @@ -0,0 +1,696 @@ +"""Models for the Docling's adoption of Web Video Text Tracks format.""" + +import re +import warnings +from collections.abc import Iterator +from enum import Enum +from functools import total_ordering +from typing import Annotated, ClassVar, Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic.types import StringConstraints +from typing_extensions import Self, override + +_VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"} +_ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);") +START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"] + + +class WebVTTLineTerminator(str, Enum): + """WebVTT line terminator.""" + + CRLF = "\r\n" + LF = "\n" + CR = "\r" + + 
+WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")] + + +@total_ordering +class WebVTTTimestamp(BaseModel): + """WebVTT timestamp. + + The timestamp is a string consisting of the following components in the given order: + + - hours (optional, required if non-zero): two or more digits + - minutes: two digits between 0 and 59 + - a colon character (:) + - seconds: two digits between 0 and 59 + - a full stop character (.) + - thousandths of a second: three digits + + A WebVTT timestamp is always interpreted relative to the current playback position + of the media data that the WebVTT file is to be synchronized with. + """ + + model_config = ConfigDict(regex_engine="python-re") + + raw: Annotated[ + str, + Field(description="A representation of the WebVTT Timestamp as a single string"), + ] + + _pattern: ClassVar[re.Pattern] = re.compile(r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$") + _hours: int + _minutes: int + _seconds: int + _millis: int + + @model_validator(mode="after") + def validate_raw(self) -> Self: + """Validate the WebVTT timestamp as a string.""" + m = self._pattern.match(self.raw) + if not m: + raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}") + self._hours = int(m.group(1)) if m.group(1) else 0 + self._minutes = int(m.group(2)) + self._seconds = int(m.group(3)) + self._millis = int(m.group(4)) + + if self._minutes < 0 or self._minutes > 59: + raise ValueError("Minutes must be between 0 and 59") + if self._seconds < 0 or self._seconds > 59: + raise ValueError("Seconds must be between 0 and 59") + + return self + + @property + def seconds(self) -> float: + """A representation of the WebVTT Timestamp in seconds.""" + return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0 + + @classmethod + def from_seconds(cls, seconds: float) -> Self: + """Create a WebVTT timestamp from seconds. 
+ + Args: + seconds: The time in seconds (can include fractional seconds for milliseconds). + + Returns: + A WebVTT timestamp instance. + """ + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + millis: int = round((seconds % 1) * 1000) + + return cls(raw=f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}") + + def __eq__(self, other: object) -> bool: + """Two timestamps are equal if their total number of seconds is equal.""" + if not isinstance(other, WebVTTTimestamp): + return NotImplemented + return self.seconds == other.seconds + + def __lt__(self, other: "WebVTTTimestamp") -> bool: + """Return True if this timestamp occurs before `other`.""" + if not isinstance(other, WebVTTTimestamp): + return NotImplemented + return self.seconds < other.seconds + + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the timestamp as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0. + + Returns: + Formatted timestamp string. + """ + if omit_hours_if_zero and self._hours == 0: + return f"{self._minutes:02d}:{self._seconds:02d}.{self._millis:03d}" + return self.raw + + @override + def __str__(self) -> str: + """Return a string representation of a WebVTT timestamp. + + Always returns the full timestamp format including hours (HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation (MM:SS.mmm) when hours are zero. 
+ """ + return self.raw + + +class WebVTTCueTimings(BaseModel): + """WebVTT cue timings.""" + + start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")] + end: Annotated[WebVTTTimestamp, Field(description="End time offset of the cue")] + + @model_validator(mode="after") + def check_order(self) -> Self: + """Ensure start timestamp is less than end timestamp.""" + if self.start and self.end: + if self.end <= self.start: + raise ValueError("End timestamp must be greater than start timestamp") + return self + + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the cue timings as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in both timestamps. + + Returns: + Formatted cue timings string in the format "start --> end". + """ + start_str = self.start.format(omit_hours_if_zero=omit_hours_if_zero) + end_str = self.end.format(omit_hours_if_zero=omit_hours_if_zero) + return f"{start_str} --> {end_str}" + + @override + def __str__(self) -> str: + """Return a string representation of the cue timings. + + Always returns the full format including hours (HH:MM:SS.mmm --> HH:MM:SS.mmm), + even when hours are zero. Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. 
+ """ + return f"{self.start} --> {self.end}" + + +class WebVTTCueTextSpan(BaseModel): + """WebVTT cue text span.""" + + kind: Literal["text"] = "text" + text: Annotated[str, Field(description="The cue text.")] + + @field_validator("text", mode="after") + @classmethod + def is_valid_text(cls, value: str) -> str: + """Ensure cue text contains only permitted characters and HTML entities.""" + for match in _ENTITY_PATTERN.finditer(value): + entity = match.group(1) + if entity not in _VALID_ENTITIES: + raise ValueError(f"Cue text contains an invalid HTML entity: &{entity};") + if "&" in re.sub(_ENTITY_PATTERN, "", value): + raise ValueError("Found '&' not part of a valid entity in the cue text") + if any(ch in value for ch in {"\n", "\r", "<"}): + raise ValueError("Cue text contains invalid characters") + if len(value) == 0: + raise ValueError("Cue text cannot be empty") + + return value + + @override + def __str__(self) -> str: + """Return a string representation of the cue text span.""" + return self.text + + +class WebVTTCueComponentWithTerminator(BaseModel): + """WebVTT caption or subtitle cue component optionally with a line terminator.""" + + component: "WebVTTCueComponent" + terminator: WebVTTLineTerminator | None = None + + @override + def __str__(self) -> str: + """Return a string representation of the cue component with terminator.""" + return f"{self.component}{self.terminator.value if self.terminator else ''}" + + +class WebVTTCueInternalText(BaseModel): + """WebVTT cue internal text.""" + + terminator: WebVTTLineTerminator | None = None + components: Annotated[ + list[WebVTTCueComponentWithTerminator], + Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")), + ] = [] + + @override + def __str__(self) -> str: + """Return a string representation of the cue internal text.""" + cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}" + return cue_str + + +class 
WebVTTCueSpanStartTag(BaseModel): + """WebVTT cue span start tag.""" + + name: Annotated[START_TAG_NAMES, Field(description="The tag name")] + classes: Annotated[ + list[str] | None, + Field(description="List of classes representing the cue span's significance"), + ] = None + + @field_validator("classes", mode="after") + @classmethod + def validate_classes(cls, value: list[str] | None) -> list[str] | None: + """Validate cue span start tag classes.""" + for item in value or []: + if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): + raise ValueError("A cue span start tag class contains invalid characters") + if not item: + raise ValueError("A cue span start tag class cannot be empty") + return value + + def _get_name_with_classes(self) -> str: + """Return the name of the cue span start tag with classes.""" + return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name + + @override + def __str__(self) -> str: + """Return a string representation of the cue span start tag.""" + return f"<{self._get_name_with_classes()}>" + + +class WebVTTCueSpanStartTagAnnotated(WebVTTCueSpanStartTag): + """WebVTT cue span start tag requiring an annotation.""" + + annotation: Annotated[str, Field(description="Cue span start tag annotation")] + + @field_validator("annotation", mode="after") + @classmethod + def is_valid_annotation(cls, value: str) -> str: + """Ensure annotation contains only permitted characters and HTML entities.""" + for match in _ENTITY_PATTERN.finditer(value): + entity = match.group(1) + if entity not in _VALID_ENTITIES: + raise ValueError(f"Annotation contains an invalid HTML entity: &{entity};") + if "&" in re.sub(_ENTITY_PATTERN, "", value): + raise ValueError("Found '&' not part of a valid entity in annotation") + if any(ch in value for ch in {"\n", "\r", ">"}): + raise ValueError("Annotation contains invalid characters") + if len(value) == 0: + raise ValueError("Annotation cannot be empty") + + return value + + @override 
+ def __str__(self) -> str: + """Return a string representation of the cue span start tag.""" + return f"<{self._get_name_with_classes()} {self.annotation}>" + + +class WebVTTCueLanguageSpanStartTag(WebVTTCueSpanStartTagAnnotated): + """WebVTT cue language span start tag.""" + + _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE) + + name: Literal["lang"] = Field("lang", description="The tag name") + + @field_validator("annotation", mode="after") + @classmethod + @override + def is_valid_annotation(cls, value: str) -> str: + """Ensure that the language annotation is in BCP 47 language tag format.""" + if cls._pattern.match(value): + return value + else: + raise ValueError("Annotation should be in BCP 47 language tag format") + + +class WebVTTCueComponentBase(BaseModel): + """WebVTT caption or subtitle cue component. + + All the WebVTT caption or subtitle cue components are represented by this class + except the WebVTT cue text span, which requires different definitions. + """ + + kind: Literal["c", "b", "i", "u", "v", "lang"] + start_tag: WebVTTCueSpanStartTag + internal_text: WebVTTCueInternalText + + @model_validator(mode="after") + def check_tag_names_match(self) -> Self: + """Ensure that the start tag name matches this cue component type.""" + if self.kind != self.start_tag.name: + raise ValueError("The tag name of this cue component should be {self.kind}") + return self + + @override + def __str__(self) -> str: + """Return a string representation of the cue component.""" + return f"{self.start_tag}{self.internal_text}" + + +class WebVTTCueVoiceSpan(WebVTTCueComponentBase): + """WebVTT cue voice span associated with a specific voice.""" + + kind: Literal["v"] = "v" + start_tag: WebVTTCueSpanStartTagAnnotated + + +class WebVTTCueClassSpan(WebVTTCueComponentBase): + """WebVTT cue class span. 
+ + It represents a span of text and it is used to annotate parts of the cue with + applicable classes without implying further meaning (such as italics or bold). + """ + + kind: Literal["c"] = "c" + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="c") + + +class WebVTTCueItalicSpan(WebVTTCueComponentBase): + """WebVTT cue italic span representing a span of italic text.""" + + kind: Literal["i"] = "i" + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="i") + + +class WebVTTCueBoldSpan(WebVTTCueComponentBase): + """WebVTT cue bold span representing a span of bold text.""" + + kind: Literal["b"] = "b" + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="b") + + +class WebVTTCueUnderlineSpan(WebVTTCueComponentBase): + """WebVTT cue underline span representing a span of underline text.""" + + kind: Literal["u"] = "u" + start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="u") + + +class WebVTTCueLanguageSpan(WebVTTCueComponentBase): + """WebVTT cue language span. + + It represents a span of text and it is used to annotate parts of the cue where the + applicable language might be different than the surrounding text's, without + implying further meaning (such as italics or bold). + """ + + kind: Literal["lang"] = "lang" + start_tag: WebVTTCueLanguageSpanStartTag + + +WebVTTCueComponent = Annotated[ + WebVTTCueTextSpan + | WebVTTCueClassSpan + | WebVTTCueItalicSpan + | WebVTTCueBoldSpan + | WebVTTCueUnderlineSpan + | WebVTTCueVoiceSpan + | WebVTTCueLanguageSpan, + Field( + discriminator="kind", + description="The type of WebVTT caption or subtitle cue component.", + ), +] + + +class WebVTTCueBlock(BaseModel): + """Model representing a WebVTT cue block. + + The optional WebVTT cue settings list is not supported. + The cue payload is limited to the following spans: text, class, italic, bold, + underline, and voice. 
+ """ + + model_config = ConfigDict(regex_engine="python-re") + + identifier: Annotated[WebVTTCueIdentifier | None, Field(description="The WebVTT cue identifier")] = None + timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")] + payload: Annotated[ + list[WebVTTCueComponentWithTerminator], + Field(description="The WebVTT caption or subtitle cue text"), + ] + + # pattern of a WebVTT cue span start/end tag + _pattern_tag: ClassVar[re.Pattern] = re.compile( + r"<(?P/?)" + r"(?Pi|b|c|u|v|lang)" + r"(?P(?:\.[^\t\n\r &<>.]+)*)" + r"(?:[ \t](?P[^\n\r&>]*))?>" + ) + + @field_validator("payload", mode="after") + @classmethod + def validate_payload(cls, payload): + """Ensure that the cue payload contains valid text.""" + for voice in payload: + if "-->" in str(voice): + raise ValueError("Cue payload must not contain '-->'") + return payload + + @staticmethod + def _create_text_components( + text: str, + ) -> Iterator[WebVTTCueComponentWithTerminator]: + text_list = text.split("\n") + for idx, line in enumerate(text.split("\n")): + terminator = WebVTTLineTerminator.LF if idx < len(text_list) - 1 or text.endswith("\n") else None + if len(line) > 0: + yield WebVTTCueComponentWithTerminator( + component=WebVTTCueTextSpan(text=line), + terminator=terminator, + ) + + @classmethod + def parse(cls, raw: str) -> Self: + """Parse a WebVTT cue block from a string. + + Args: + raw: The raw WebVTT cue block string. + + Returns: + The parsed WebVTT cue block. 
+ """ + lines = raw.strip().splitlines() + if not lines: + raise ValueError("Cue block must have at least one line") + identifier: WebVTTCueIdentifier | None = None + timing_line = lines[0] + if "-->" not in timing_line and len(lines) > 1: + identifier = timing_line + timing_line = lines[1] + cue_lines = lines[2:] + else: + cue_lines = lines[1:] + + if "-->" not in timing_line: + raise ValueError("Cue block must contain WebVTT cue timings") + + start, end = [t.strip() for t in timing_line.split("-->")] + end = re.split(" |\t", end)[0] # ignore the cue settings list + timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end)) + cue_text = "\n".join(cue_lines).strip() + # adding close tag for cue spans without end tag + for omm in {"v"}: + if cue_text.startswith(f"<{omm}") and f"" not in cue_text: + cue_text += f"" + break + + stack: list[list[WebVTTCueComponentWithTerminator]] = [[]] + tag_stack: list[dict] = [] + + pos = 0 + matches = list(cls._pattern_tag.finditer(cue_text)) + i = 0 + while i < len(matches): + match = matches[i] + if match.start() > pos: + text = cue_text[pos : match.start()] + stack[-1].extend(cls._create_text_components(text)) + gps = {k: (v if v else None) for k, v in match.groupdict().items()} + + if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}: + if not gps["end"]: + tag_stack.append(gps) + stack.append([]) + else: + children = stack.pop() if stack else [] + if tag_stack: + closed = tag_stack.pop() + if (ct := closed["tag"]) != gps["tag"]: + raise ValueError(f"Incorrect end tag: {ct}") + class_string = closed["class"] + annotation = closed["annotation"] + classes: list[str] | None = None + if class_string: + classes = [c for c in class_string.split(".") if c] + st: WebVTTCueSpanStartTag + if annotation and ct == "lang": + st = WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip()) + elif annotation: + st = WebVTTCueSpanStartTagAnnotated(name=ct, 
classes=classes, annotation=annotation.strip()) + else: + st = WebVTTCueSpanStartTag(name=ct, classes=classes) + it = WebVTTCueInternalText(components=children) + cp: WebVTTCueComponent + if ct == "c": + cp = WebVTTCueClassSpan(start_tag=st, internal_text=it) + elif ct == "b": + cp = WebVTTCueBoldSpan(start_tag=st, internal_text=it) + elif ct == "i": + cp = WebVTTCueItalicSpan(start_tag=st, internal_text=it) + elif ct == "u": + cp = WebVTTCueUnderlineSpan(start_tag=st, internal_text=it) + elif ct == "lang": + cp = WebVTTCueLanguageSpan(start_tag=st, internal_text=it) + elif ct == "v": + cp = WebVTTCueVoiceSpan(start_tag=st, internal_text=it) + stack[-1].append(WebVTTCueComponentWithTerminator(component=cp)) + + pos = match.end() + i += 1 + + if pos < len(cue_text): + text = cue_text[pos:] + stack[-1].extend(cls._create_text_components(text)) + + return cls( + identifier=identifier, + timings=timings, + payload=stack[0], + ) + + def format(self, omit_hours_if_zero: bool = False, omit_voice_end: bool = False) -> str: + """Format the WebVTT cue block as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in the timings. + omit_voice_end: If True and this cue block has a WebVTT cue voice span as + its only component, omit the voice end tag for brevity. + + Returns: + Formatted cue block string. + """ + parts = [] + if self.identifier: + parts.append(f"{self.identifier}\n") + timings_line = self.timings.format(omit_hours_if_zero=omit_hours_if_zero) + parts.append(timings_line + "\n") + for idx, span in enumerate(self.payload): + if omit_voice_end and idx == 0 and len(self.payload) == 1 and span.component.kind == "v": + parts.append(str(span).removesuffix("")) + else: + parts.append(str(span)) + + return "".join(parts) + "\n" + + def __str__(self) -> str: + """Return a string representation of the WebVTT cue block. + + Always returns the full format including hours in timestamps (HH:MM:SS.mmm), + even when hours are zero. 
Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. + Always returns the WebVTT cue voice spans with the voice end tag, even if this + cue block has a WebVTT cue voice span as a single component in the payload. Use + `format(omit_voice_end=True)` to get a shorter representation without the voice + end tag. + """ + return self.format() + + +class WebVTTFile(BaseModel): + """A model representing a WebVTT file.""" + + _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)") + cue_blocks: list[WebVTTCueBlock] + title: str | None = None + + @staticmethod + def verify_signature(content: str) -> bool: + """Verify the WebVTT file signature.""" + if not content: + return False + elif len(content) == 6: + return content == "WEBVTT" + elif len(content) > 6 and content.startswith("WEBVTT"): + return content[6] in (" ", "\t", "\n") + else: + return False + + @model_validator(mode="after") + def validate_start_time(self) -> Self: + """Validate cue start times. + + The start time offset of the cue must be greater than or equal to the start + time offsets of all previous cues. + """ + idx: int = 0 + while idx < (len(self.cue_blocks) - 1): + if self.cue_blocks[idx + 1].timings.start < self.cue_blocks[idx].timings.start: + raise ValueError( + f"The start time offset of block {idx + 1} must be greater than or" + " equal to the start time offsets of all previous cues in the file" + ) + idx += 1 + + return self + + @classmethod + def parse(cls, raw: str) -> Self: + """Parse a WebVTT file. + + Args: + raw: The raw WebVTT file content. + + Returns: + The parsed WebVTT file. 
+ """ + # Normalize newlines to LF + raw = raw.replace("\r\n", "\n").replace("\r", "\n") + + # Check WebVTT signature + if not cls.verify_signature(raw): + raise ValueError("Invalid WebVTT file signature") + + # Strip "WEBVTT" header line + lines = raw.split("\n", 1) + title = lines[0].removeprefix("WEBVTT").strip() or None + body = lines[1] if len(lines) > 1 else "" + + # Remove NOTE/STYLE/REGION blocks + body = re.sub(cls._pattern, "", body) + + # Split into cue blocks + raw_blocks = re.split(r"\n\s*\n", body.strip()) + cues: list[WebVTTCueBlock] = [] + for block in raw_blocks: + try: + cues.append(WebVTTCueBlock.parse(block)) + except ValueError as e: + warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning) + + return cls(title=title, cue_blocks=cues) + + def __iter__(self) -> Iterator[WebVTTCueBlock]: # type: ignore[override] + """Return an iterator over the cue blocks.""" + return iter(self.cue_blocks) + + def __getitem__(self, idx) -> WebVTTCueBlock: + """Return the cue block at the given index.""" + return self.cue_blocks[idx] + + def __len__(self) -> int: + """Return the number of cue blocks.""" + return len(self.cue_blocks) + + def format(self, omit_hours_if_zero: bool = False) -> str: + """Format the WebVTT file as a string. + + Args: + omit_hours_if_zero: If True, omit hours when they are 0 in the timings. + + Returns: + Formatted WebVTT file string. + """ + parts: list[str] = [] + + if self.title: + parts.append(f"WEBVTT {self.title}\n") + else: + parts.append("WEBVTT\n") + + for cue_block in self.cue_blocks: + parts.append("\n") + parts.append(cue_block.format(omit_hours_if_zero=omit_hours_if_zero)) + + # Remove the trailing newline from the last cue block + return "".join(parts).rstrip("\n") + + def __str__(self) -> str: + """Return a string representation of the WebVTT file. + + Always returns the full format including hours in timestamps (HH:MM:SS.mmm), + even when hours are zero. 
Use `format(omit_hours_if_zero=True)` to get + a shorter representation when hours are zero. + """ + return self.format() diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py index 04761799..26042436 100644 --- a/docling_core/utils/legacy.py +++ b/docling_core/utils/legacy.py @@ -7,20 +7,23 @@ from docling_core.types.doc import ( BoundingBox, + ContentLayer, CoordOrigin, DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, + ListItem, PictureItem, ProvenanceItem, SectionHeaderItem, Size, TableCell, + TableData, TableItem, TextItem, ) -from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData from docling_core.types.legacy_doc.base import ( BaseCell, BaseText, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 03b7d8cd..6b617f28 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -238,6 +238,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -656,6 +675,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -798,6 +836,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -1203,6 +1260,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -1375,6 +1451,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -1751,6 +1846,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -2139,16 +2253,19 @@ "type": "object" }, "ProvenanceItem": { - "description": "ProvenanceItem.", + "description": "Provenance information for elements extracted from a textual document.\n\nA `ProvenanceItem` object acts as a lightweight pointer back into the original\ndocument for an extracted element. It applies to documents with an explicity\nor implicit layout, such as PDF, HTML, docx, or pptx.", "properties": { "page_no": { + "description": "Page number", "title": "Page No", "type": "integer" }, "bbox": { - "$ref": "#/$defs/BoundingBox" + "$ref": "#/$defs/BoundingBox", + "description": "Bounding box" }, "charspan": { + "description": "Character span (0-indexed)", "maxItems": 2, "minItems": 2, "prefixItems": [ @@ -2332,6 +2449,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -2627,6 +2763,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. 
Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -2832,6 +2987,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -2944,6 +3118,25 @@ "title": "Prov", "type": "array" }, + "source": { + "default": [], + "description": "The provenance of this document item. Currently, it is only used for media track provenance.", + "items": { + "discriminator": { + "mapping": { + "track": "#/$defs/TrackProvenance" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/TrackProvenance" + } + ] + }, + "title": "Source", + "type": "array" + }, "comments": { "default": [], "items": { @@ -2997,6 +3190,189 @@ ], "title": "TitleItem", "type": "object" + }, + "TrackProvenance": { + "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). 
A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.", + "properties": { + "kind": { + "const": "track", + "default": "track", + "description": "Identifiers this type of provenance.", + "title": "Kind", + "type": "string" + }, + "start_time": { + "description": "Start time offset of the track cue in seconds", + "examples": [ + 11.0, + 6.5, + 5370.0 + ], + "title": "Start Time", + "type": "number" + }, + "end_time": { + "description": "End time offset of the track cue in seconds", + "examples": [ + 12.0, + 8.2, + 5370.1 + ], + "title": "End Time", + "type": "number" + }, + "identifier": { + "anyOf": [ + { + "pattern": "^(?!.*-->)[^\\n\\r]+$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An identifier of the cue", + "examples": [ + "test", + "123", + "b72d946" + ], + "title": "Identifier" + }, + "tags": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "$ref": "#/$defs/WebVTTCueSpanStartTag" + }, + { + "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).", + "examples": [ + [ + { + "annotation": "John", + "classes": [ + "loud" + ], + "name": "v" + } + ], + [ + { + "classes": [ + "foreignphrase" + ], + "name": "i" + } + ] + ], + "title": "Tags" + } + }, + "required": [ + "start_time", + "end_time" + ], + "title": "TrackProvenance", + "type": "object" + }, + "WebVTTCueSpanStartTag": { + "description": "WebVTT cue span start tag.", + "properties": { + "name": { + "description": "The tag name", + "enum": [ + "c", + "b", + "i", + "u", + "v", + "lang" + ], + "title": "Name", + "type": "string" + }, + "classes": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": 
null, + "description": "List of classes representing the cue span's significance", + "title": "Classes" + } + }, + "required": [ + "name" + ], + "title": "WebVTTCueSpanStartTag", + "type": "object" + }, + "WebVTTCueSpanStartTagAnnotated": { + "description": "WebVTT cue span start tag requiring an annotation.", + "properties": { + "name": { + "description": "The tag name", + "enum": [ + "c", + "b", + "i", + "u", + "v", + "lang" + ], + "title": "Name", + "type": "string" + }, + "classes": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of classes representing the cue span's significance", + "title": "Classes" + }, + "annotation": { + "description": "Cue span start tag annotation", + "title": "Annotation", + "type": "string" + } + }, + "required": [ + "name", + "annotation" + ], + "title": "WebVTTCueSpanStartTagAnnotated", + "type": "object" } }, "description": "DoclingDocument.", diff --git a/test/data/doc/webvtt_example_01.gt.vtt b/test/data/doc/webvtt_example_01.gt.vtt new file mode 100644 index 00000000..cad1c72a --- /dev/null +++ b/test/data/doc/webvtt_example_01.gt.vtt @@ -0,0 +1,40 @@ +WEBVTT + +00:00:11.000 --> 00:00:13.000 +We are in New York City + +00:00:13.000 --> 00:00:16.000 +We’re actually at the Lucern Hotel, just down the street + +00:00:16.000 --> 00:00:18.000 +from the American Museum of Natural History + +00:00:18.000 --> 00:00:20.000 +And with me is Neil deGrasse Tyson + +00:00:20.000 --> 00:00:22.000 +Astrophysicist, Director of the Hayden Planetarium + +00:00:22.000 --> 00:00:24.000 +at the AMNH. + +00:00:24.000 --> 00:00:26.000 +Thank you for walking down here. + +00:00:27.000 --> 00:00:30.000 +And I want to do a follow-up on the last conversation we did. + +00:00:30.000 --> 00:00:31.500 +When we e-mailed— + +00:00:30.500 --> 00:00:32.500 +Didn’t we talk about enough in that conversation? + +00:00:32.000 --> 00:00:35.500 +No! 
No no no no; 'cos 'cos obviously 'cos + +00:00:32.500 --> 00:00:33.500 +Laughs + +00:00:35.500 --> 00:00:38.000 +You know I’m so excited my glasses are falling off here. \ No newline at end of file diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json new file mode 100644 index 00000000..85d119be --- /dev/null +++ b/test/data/doc/webvtt_example_01.json @@ -0,0 +1,391 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_01", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 16887312431371817791, + "filename": "webvtt_example_01.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 11.0, + "end_time": 13.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "We are in New York City", + "text": "We are in New York City" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 13.0, + "end_time": 16.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + 
"orig": "We’re actually at the Lucern Hotel, just down the street", + "text": "We’re actually at the Lucern Hotel, just down the street" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 16.0, + "end_time": 18.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "from the American Museum of Natural History", + "text": "from the American Museum of Natural History" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 18.0, + "end_time": 20.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "And with me is Neil deGrasse Tyson", + "text": "And with me is Neil deGrasse Tyson" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 20.0, + "end_time": 22.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "Astrophysicist, Director of the Hayden Planetarium", + "text": "Astrophysicist, Director of the Hayden Planetarium" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 22.0, + "end_time": 24.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "at the AMNH.", + "text": "at the AMNH." 
+ }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 24.0, + "end_time": 26.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "Thank you for walking down here.", + "text": "Thank you for walking down here." + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 27.0, + "end_time": 30.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "And I want to do a follow-up on the last conversation we did.", + "text": "And I want to do a follow-up on the last conversation we did." + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 30.0, + "end_time": 31.5, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "When we e-mailed—", + "text": "When we e-mailed—" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 30.5, + "end_time": 32.5, + "tags": [ + { + "name": "v", + "annotation": "Neil deGrasse Tyson" + } + ] + } + ], + "orig": "Didn’t we talk about enough in that conversation?", + "text": "Didn’t we talk about enough in that conversation?" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 32.0, + "end_time": 35.5, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "No! No no no no; 'cos 'cos obviously 'cos", + "text": "No! 
No no no no; 'cos 'cos obviously 'cos" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 32.5, + "end_time": 33.5, + "tags": [ + { + "name": "v", + "annotation": "Neil deGrasse Tyson" + } + ] + } + ], + "orig": "Laughs", + "text": "Laughs", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 35.5, + "end_time": 38.0, + "tags": [ + { + "name": "v", + "annotation": "Roger Bingham" + } + ] + } + ], + "orig": "You know I’m so excited my glasses are falling off here.", + "text": "You know I’m so excited my glasses are falling off here." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt new file mode 100644 index 00000000..8f9811e7 --- /dev/null +++ b/test/data/doc/webvtt_example_02.gt.vtt @@ -0,0 +1,16 @@ +WEBVTT + +00:00:00.000 --> 00:00:02.000 +It’s a blue apple tree! + +00:00:02.000 --> 00:00:04.000 +No way! + +00:00:04.000 --> 00:00:06.000 +Hee! laughter + +00:00:06.000 --> 00:00:08.000 +That’s awesome! 
+ +00:00:08.000 --> 00:00:10.000 +Sur les playground, ici à Montpellier \ No newline at end of file diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json new file mode 100644 index 00000000..55fd15ea --- /dev/null +++ b/test/data/doc/webvtt_example_02.json @@ -0,0 +1,308 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_02", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 8584853280299071027, + "filename": "webvtt_example_02.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 0.0, + "end_time": 2.0, + "tags": [ + { + "name": "v", + "annotation": "Esme", + "classes": [ + "first", + "loud" + ] + } + ] + } + ], + "orig": "It\u2019s a blue apple tree!", + "text": "It\u2019s a blue apple tree!" 
+ }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 2.0, + "end_time": 4.0, + "tags": [ + { + "name": "v", + "annotation": "Mary" + } + ] + } + ], + "orig": "No way!", + "text": "No way!" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 4.0, + "end_time": 6.0, + "tags": [ + { + "name": "v", + "annotation": "Esme" + } + ] + } + ], + "orig": "Hee!", + "text": "Hee!" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": " ", + "text": " " + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": "laughter", + "text": "laughter", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 6.0, + "end_time": 8.0, + "tags": [ + { + "name": "v", + "annotation": "Mary", + "classes": [ + "loud" + ] + } + ] + } + ], + "orig": "That\u2019s awesome!", + "text": "That\u2019s awesome!" 
+ }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": "Sur les ", + "text": "Sur les " + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 8.0, + "end_time": 10.0, + "tags": [ + { + "name": "lang", + "annotation": "en" + }, + { + "name": "i", + "classes": [ + "foreignphrase" + ] + } + ] + } + ], + "orig": "playground", + "text": "playground", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": ", ici \u00e0 Montpellier", + "text": ", ici \u00e0 Montpellier" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_03.gt.vtt b/test/data/doc/webvtt_example_03.gt.vtt new file mode 100644 index 00000000..a4dc1291 --- /dev/null +++ b/test/data/doc/webvtt_example_03.gt.vtt @@ -0,0 +1,57 @@ +WEBVTT + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +00:00:04.963 --> 00:00:08.571 +OK, +I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 +00:00:08.571 --> 00:00:09.403 +properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 +00:00:10.683 --> 00:00:11.563 +Good. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 +00:00:13.363 --> 00:00:13.803 +Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 +00:00:49.603 --> 00:00:53.363 +I was also thinking. 
+ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 +00:00:54.963 --> 00:01:02.072 +Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 +00:01:02.072 --> 00:01:06.811 +some metadata, +some options that can be specific. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 +00:01:10.243 --> 00:01:13.014 +Yeah, +I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 +00:01:10.563 --> 00:01:12.643 +But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +00:01:13.014 --> 00:01:15.907 +than me. +I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +00:01:50.222 --> 00:01:51.643 +give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +00:01:52.043 --> 00:01:55.043 +Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +00:01:54.603 --> 00:01:55.283 +See you. \ No newline at end of file diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json new file mode 100644 index 00000000..7b6faa6c --- /dev/null +++ b/test/data/doc/webvtt_example_03.json @@ -0,0 +1,503 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_03", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 11620880316586573676, + "filename": "webvtt_example_03.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + 
{ + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "OK,", + "text": "OK," + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "I think now we should be recording", + "text": "I think now we should be recording" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 8.571, + "end_time": 9.403, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "properly.", + "text": "properly." + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 10.683, + "end_time": 11.563, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" + } + ], + "orig": "Good.", + "text": "Good." 
+ }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 13.363, + "end_time": 13.803, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "Yeah.", + "text": "Yeah." + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 49.603, + "end_time": 53.363, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "I was also thinking.", + "text": "I was also thinking." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 54.963, + "end_time": 62.072, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "Would be maybe good to create items,", + "text": "Would be maybe good to create items," + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "some metadata,", + "text": "some metadata," + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "tags": [ + { + 
"name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "some options that can be specific.", + "text": "some options that can be specific." + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "Yeah,", + "text": "Yeah," + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "I mean I think you went even more than", + "text": "I mean I think you went even more than" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 70.563, + "end_time": 72.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "But we preserved the atoms.", + "text": "But we preserved the atoms." + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "than me.", + "text": "than me." 
+ }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "I just opened the format.", + "text": "I just opened the format." + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 110.222, + "end_time": 111.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "give it a try, yeah.", + "text": "give it a try, yeah." + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 112.043, + "end_time": 115.043, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker B" + } + ] + } + ], + "orig": "Okay, talk to you later.", + "text": "Okay, talk to you later." + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 114.603, + "end_time": 115.283, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", + "tags": [ + { + "name": "v", + "annotation": "Speaker A" + } + ] + } + ], + "orig": "See you.", + "text": "See you." 
+ } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt new file mode 100644 index 00000000..ce7fcf65 --- /dev/null +++ b/test/data/doc/webvtt_example_04.gt.vtt @@ -0,0 +1,9 @@ +WEBVTT Danger of Nitrogen + +00:00:01.000 --> 00:00:04.000 +Never drink liquid nitrogen. + +00:00:05.000 --> 00:00:09.000 +— It will perforate your stomach. +— You could die. +This is true. \ No newline at end of file diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json new file mode 100644 index 00000000..98e7da21 --- /dev/null +++ b/test/data/doc/webvtt_example_04.json @@ -0,0 +1,210 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_04", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 11822397499369478441, + "filename": "webvtt_example_04.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "Danger of Nitrogen", + "text": "Danger of Nitrogen" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" 
+ }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 1.0, + "end_time": 4.0 + } + ], + "orig": "Never drink liquid nitrogen.", + "text": "Never drink liquid nitrogen." + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": "\u2014 It will perforate your stomach.", + "text": "\u2014 It will perforate your stomach." + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": "\u2014 You could ", + "text": "\u2014 You could " + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 5.0, + "end_time": 9.0, + "tags": [ + { + "name": "b", + "classes": [ + "loud" + ] + } + ] + } + ], + "orig": "die", + "text": "die", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 5.0, + "end_time": 9.0 + } + ], + "orig": ".", + "text": "." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 5.0, + "end_time": 9.0, + "tags": [ + { + "name": "v", + "annotation": "John" + } + ] + } + ], + "orig": "This is true.", + "text": "This is true." 
+ } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt new file mode 100644 index 00000000..fd7b788c --- /dev/null +++ b/test/data/doc/webvtt_example_05.gt.vtt @@ -0,0 +1,10 @@ +WEBVTT + +agcvs-08234 +04:03:00.000 --> 04:06:00.000 +Last night the chef surprised us with a culinary adventure. + +agcvs-08234 +04:06:00.000 --> 04:06:58.239 +The waiter offered a steaming bowl of paella that instantly transported the diners to a sunny Mediterranean coast. +The dessert’s unexpected arcobaleno of flavors left everyone in awe. \ No newline at end of file diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json new file mode 100644 index 00000000..4af18174 --- /dev/null +++ b/test/data/doc/webvtt_example_05.json @@ -0,0 +1,366 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_04", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 5389775195091554844, + "filename": "webvtt_example_04.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": 
"#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14580.0, + "end_time": 14760.0, + "identifier": "agcvs-08234" + } + ], + "orig": "Last night the chef surprised us with a culinary adventure.", + "text": "Last night the chef surprised us with a culinary adventure." + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The waiter offered a ", + "text": "The waiter offered a " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "steaming bowl of ", + "text": "steaming bowl of ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "tags": [ + { + "name": "lang", + "annotation": "es-ES" + } + ] + } + ], + "orig": "paella", + "text": "paella", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + 
"children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " that instantly transported the diners to a sunny Mediterranean coast.", + "text": " that instantly transported the diners to a sunny Mediterranean coast." + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The dessert\u2019s ", + "text": "The dessert\u2019s " + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "tags": [ + { + "name": "b", + "classes": [ + "loud" + ] + } + ] + } + ], + "orig": "unexpected", + "text": "unexpected", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " ", + "text": " ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "tags": [ + { + "name": "lang", + "annotation": "it" + } + ] + } + ], + "orig": "arcobaleno", + "text": 
"arcobaleno", + "formatting": { + "bold": false, + "italic": true, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " of flavors", + "text": " of flavors", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "source": [ + { + "kind": "track", + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " left everyone in awe.", + "text": " left everyone in awe." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/test/data/webvtt/webvtt_example_01.vtt b/test/data/webvtt/webvtt_example_01.vtt new file mode 100644 index 00000000..333ca4a8 --- /dev/null +++ b/test/data/webvtt/webvtt_example_01.vtt @@ -0,0 +1,42 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +00:11.000 --> 00:13.000 +We are in New York City + +00:13.000 --> 00:16.000 +We’re actually at the Lucern Hotel, just down the street + +00:16.000 --> 00:18.000 +from the American Museum of Natural History + +00:18.000 --> 00:20.000 +And with me is Neil deGrasse Tyson + +00:20.000 --> 00:22.000 +Astrophysicist, Director of the Hayden Planetarium + +00:22.000 --> 00:24.000 +at the AMNH. + +00:24.000 --> 00:26.000 +Thank you for walking down here. + +00:27.000 --> 00:30.000 +And I want to do a follow-up on the last conversation we did. 
+ +00:30.000 --> 00:31.500 align:right size:50% +When we e-mailed— + +00:30.500 --> 00:32.500 align:left size:50% +Didn’t we talk about enough in that conversation? + +00:32.000 --> 00:35.500 align:right size:50% +No! No no no no; 'cos 'cos obviously 'cos + +00:32.500 --> 00:33.500 align:left size:50% +Laughs + +00:35.500 --> 00:38.000 +You know I’m so excited my glasses are falling off here. diff --git a/test/data/webvtt/webvtt_example_02.vtt b/test/data/webvtt/webvtt_example_02.vtt new file mode 100644 index 00000000..1152a1e8 --- /dev/null +++ b/test/data/webvtt/webvtt_example_02.vtt @@ -0,0 +1,15 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +00:00.000 --> 00:02.000 +It’s a blue apple tree! + +00:02.000 --> 00:04.000 +No way! + +00:04.000 --> 00:06.000 +Hee! laughter + +00:06.000 --> 00:08.000 +That’s awesome! \ No newline at end of file diff --git a/test/data/webvtt/webvtt_example_03.vtt b/test/data/webvtt/webvtt_example_03.vtt new file mode 100644 index 00000000..a4dc1291 --- /dev/null +++ b/test/data/webvtt/webvtt_example_03.vtt @@ -0,0 +1,57 @@ +WEBVTT + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +00:00:04.963 --> 00:00:08.571 +OK, +I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 +00:00:08.571 --> 00:00:09.403 +properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 +00:00:10.683 --> 00:00:11.563 +Good. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 +00:00:13.363 --> 00:00:13.803 +Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 +00:00:49.603 --> 00:00:53.363 +I was also thinking. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 +00:00:54.963 --> 00:01:02.072 +Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 +00:01:02.072 --> 00:01:06.811 +some metadata, +some options that can be specific. 
+ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 +00:01:10.243 --> 00:01:13.014 +Yeah, +I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 +00:01:10.563 --> 00:01:12.643 +But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +00:01:13.014 --> 00:01:15.907 +than me. +I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +00:01:50.222 --> 00:01:51.643 +give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +00:01:52.043 --> 00:01:55.043 +Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +00:01:54.603 --> 00:01:55.283 +See you. \ No newline at end of file diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt new file mode 100644 index 00000000..78b5ba0c --- /dev/null +++ b/test/data/webvtt/webvtt_example_04.vtt @@ -0,0 +1,33 @@ +WEBVTT Danger of Nitrogen + +NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +STYLE +::cue { + background-image: linear-gradient(to bottom, dimgray, lightgray); + color: papayawhip; +} +/* Style blocks cannot use blank lines nor "dash dash greater than" */ + +REGION +id:editor-comments +width: 40% +regionanchor:0%,100% +viewportanchor:10%,90% + +REGION +id:scroll +width: 40% +regionanchor:100%,100% +viewportanchor:90%,90% +scroll:up + +00:01.000 --> 00:04.000 +Never drink liquid nitrogen. + +NOTE I’m not sure the timing is right on the following cue. + +00:05.000 --> 00:09.000 +— It will perforate your stomach. +— You could die. +This is true. 
\ No newline at end of file diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py index 58fb50db..28b41ad6 100644 --- a/test/test_deserializer_idoctags.py +++ b/test/test_deserializer_idoctags.py @@ -1,5 +1,4 @@ from pathlib import Path -from test.test_serialization_doctag import verify import pytest @@ -21,7 +20,8 @@ TableData, ) from docling_core.types.doc.labels import CodeLanguageLabel -from test.test_serialization_idoctag import add_texts_section, add_list_section +from test.test_serialization_doctag import verify +from test.test_serialization_idoctag import add_list_section, add_texts_section DO_PRINT: bool = False diff --git a/test/test_doc_base.py b/test/test_doc_base.py index 709e2eac..5d569716 100644 --- a/test/test_doc_base.py +++ b/test/test_doc_base.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError +from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance from docling_core.types.legacy_doc.base import Prov, S3Reference @@ -37,3 +38,52 @@ def test_prov(): with pytest.raises(ValidationError, match="at least 2 items"): prov["span"] = [0] Prov(**prov) + + +def test_track_provenance(): + """Test the class TrackProvenance.""" + + valid_track = TrackProvenance( + start_time=11.0, + end_time=12.0, + identifier="test", + tags = [ + {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]}, + {"name": "lang", "annotation": "en"}, + {"name": "lang", "annotation": "en-GB"}, + {"name": "i", "classes": ["foreignphrase"]}, + ] + ) + + assert valid_track + assert valid_track.start_time == 11.0 + assert valid_track.end_time == 12.0 + assert valid_track.identifier == "test" + assert valid_track.tags + assert valid_track.tags[0].annotation == "Mary" + assert valid_track.tags[0].classes == ["first", "loud"] + assert valid_track.tags[1].annotation == "en" + assert valid_track.tags[2].annotation == "en-GB" + assert valid_track.tags[3].classes == ["foreignphrase"] + + with 
pytest.raises(ValidationError, match="end_time"): + TrackProvenance(start_time=11.0) + + with pytest.raises(ValidationError, match="should be a valid dictionary"): + TrackProvenance( + start_time=11.0, + end_time=12.0, + tags=["en"], + ) + + with pytest.raises(ValidationError, match="must be greater than start"): + TrackProvenance( + start_time=11.0, + end_time=11.0, + ) + + doc = DoclingDocument(name="Unknown") + item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT) + item.source = [valid_track] + with pytest.raises(ValidationError, match="should be a valid list"): + item.source = "Invalid source" diff --git a/test/test_serialization.py b/test/test_serialization.py index 6fe3b386..fd68a347 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -15,6 +15,7 @@ MarkdownParams, OrigListItemMarkerMode, ) +from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( @@ -563,3 +564,27 @@ def test_html_inline_and_formatting(): ser = HTMLDocSerializer(doc=doc) actual = ser.serialize().text verify(exp_file=src.with_suffix(".gt.html"), actual=actual) + + +# =============================== +# WebVTT tests +# =============================== + + +@pytest.mark.parametrize( + "file_name", + [ + "webvtt_example_01", + "webvtt_example_02", + "webvtt_example_03", + "webvtt_example_04", + "webvtt_example_05", + ], +) +def test_webvtt(file_name): + src = Path(f"./test/data/doc/{file_name}.json") + doc = DoclingDocument.load_from_json(src) + + ser = WebVTTDocSerializer(doc=doc) + actual = ser.serialize().text + verify(exp_file=src.with_suffix(".gt.vtt"), actual=actual) diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py index 45d0c983..86237a9a 100644 --- a/test/test_serialization_doctag.py +++ b/test/test_serialization_doctag.py @@ 
-6,9 +6,7 @@ DocTagsDocSerializer, DocTagsParams, ) -from docling_core.types.doc import DoclingDocument -from docling_core.types.doc.document import DoclingDocument, TableData -from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData from .test_serialization import verify diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py index 43aaa79e..1c0f8479 100644 --- a/test/test_serialization_idoctag.py +++ b/test/test_serialization_idoctag.py @@ -2,37 +2,39 @@ from pathlib import Path from typing import Optional -from test.test_serialization import verify import pytest from docling_core.experimental.idoctags import ( ContentType, - WrapMode, EscapeMode, IDocTagsDocSerializer, IDocTagsParams, IDocTagsSerializationMode, IDocTagsVocabulary, + WrapMode, ) from docling_core.types.doc import ( + BoundingBox, + CodeLanguageLabel, + CoordOrigin, + DescriptionMetaField, DocItemLabel, DoclingDocument, Formatting, - Script, - TableData, -) -from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size -from docling_core.types.doc.document import ( - DescriptionMetaField, + PictureClassificationLabel, PictureClassificationMetaField, PictureClassificationPrediction, PictureMeta, ProvenanceItem, + Script, + Size, SummaryMetaField, + TableData, TabularChartMetaField, ) -from docling_core.types.doc.labels import CodeLanguageLabel, PictureClassificationLabel +from test.test_serialization import verify + def add_texts_section(doc: DoclingDocument): doc.add_text(label=DocItemLabel.TEXT, text="Simple text") @@ -427,7 +429,7 @@ def test_content_allow_all_types(sample_doc: DoclingDocument): serializer = IDocTagsDocSerializer( doc=doc, params=IDocTagsParams( - content_types={ct for ct in ContentType}, + content_types=set(ContentType), ), ) ser_txt = serializer.serialize().text diff --git a/test/test_webvtt.py b/test/test_webvtt.py new file mode 100644 index 00000000..5b1693e3 --- 
# Patch hunk for the new file test/test_webvtt.py (--- /dev/null, +++ b/test/test_webvtt.py).
"""Test the data model for WebVTT files.

Examples extracted from https://www.w3.org/TR/webvtt1/
Copyright © 2019 World Wide Web Consortium.
"""

import warnings

import pytest
from pydantic import ValidationError

from docling_core.types.doc.webvtt import (
    WebVTTCueBlock,
    WebVTTCueComponentWithTerminator,
    WebVTTCueInternalText,
    WebVTTCueItalicSpan,
    WebVTTCueLanguageSpan,
    WebVTTCueLanguageSpanStartTag,
    WebVTTCueSpanStartTagAnnotated,
    WebVTTCueTextSpan,
    WebVTTCueTimings,
    WebVTTCueVoiceSpan,
    WebVTTFile,
    WebVTTTimestamp,
)

from .test_data_gen_flag import GEN_TEST_DATA

GENERATE = GEN_TEST_DATA


def _read_text(path: str) -> str:
    """Return the UTF-8 decoded content of the fixture file at *path*."""
    with open(path, encoding="utf-8") as f:
        return f.read()


def test_vtt_cue_commponents() -> None:
    """Test the WebVTT cue components: timestamps, timings, tags, and spans."""
    # --- Valid timestamps (the hours field is optional) ---
    expected_seconds = {
        "00:01:02.345": 1 * 60 + 2.345,
        "12:34:56.789": 12 * 3600 + 34 * 60 + 56.789,
        "02:34.567": 2 * 60 + 34.567,
        "00:00:00.000": 0.0,
    }
    for raw_ts, seconds in expected_seconds.items():
        # Compare with a tolerance: the expected value is built from float
        # arithmetic and need not be bit-identical to the parsed one.
        assert WebVTTTimestamp(raw=raw_ts).seconds == pytest.approx(seconds)

    # --- Invalid timestamps ---
    invalid_timestamps = [
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    ]
    for raw_ts in invalid_timestamps:
        with pytest.raises(ValidationError):
            WebVTTTimestamp(raw=raw_ts)

    # --- Timestamp __str__ ---
    model = WebVTTTimestamp(raw="00:01:02.345")
    assert str(model) == "00:01:02.345"

    # --- Valid cue timings ---
    start = WebVTTTimestamp(raw="00:10.005")
    end = WebVTTTimestamp(raw="00:14.007")
    cue_timings = WebVTTCueTimings(start=start, end=end)
    assert cue_timings.start == start
    assert cue_timings.end == end
    assert str(cue_timings) == "00:10.005 --> 00:14.007"

    # --- Invalid cue timings: end timestamp before start ---
    start = WebVTTTimestamp(raw="00:10.700")
    end = WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(
        ValidationError, match="End timestamp must be greater than start timestamp"
    ):
        WebVTTCueTimings(start=start, end=end)

    # --- Invalid cue timings: missing end ---
    start = WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError, match="Field required"):
        WebVTTCueTimings(start=start)  # type: ignore[call-arg]

    # --- Invalid cue timings: missing start ---
    end = WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError, match="Field required"):
        WebVTTCueTimings(end=end)  # type: ignore[call-arg]

    # --- Cue text span: valid text ---
    valid_text = "This is a valid cue text span."
    span = WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text
    assert str(span) == valid_text

    # --- Cue text span: newline characters are rejected ---
    invalid_text = "This cue text span\ncontains a newline."
    with pytest.raises(ValidationError):
        WebVTTCueTextSpan(text=invalid_text)

    # --- Cue text span: ampersands ---
    # A raw "&" and an unknown entity are rejected ...
    invalid_text = "This cue text span contains &."
    with pytest.raises(ValidationError):
        WebVTTCueTextSpan(text=invalid_text)
    invalid_text = "An invalid &foo; entity"
    with pytest.raises(ValidationError):
        WebVTTCueTextSpan(text=invalid_text)
    # ... so the accepted form has to spell the ampersand as the "&amp;"
    # character reference; a raw "&" here would contradict the checks above.
    valid_text = "My favorite book is Pride &amp; Prejudice"
    span = WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text

    # --- Cue text span: less-than sign is rejected ---
    invalid_text = "This cue text span contains <."
    with pytest.raises(ValidationError):
        WebVTTCueTextSpan(text=invalid_text)

    # --- Cue text span: empty text is rejected ---
    with pytest.raises(ValidationError):
        WebVTTCueTextSpan(text="")

    # --- Start tag: annotation validation ---
    valid_annotation = "valid-annotation"
    invalid_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
    assert WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)

    # --- Start tag: classes validation ---
    annotation = "speaker name"
    valid_classes = ["class1", "class2"]
    invalid_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        WebVTTCueSpanStartTagAnnotated(
            name="v", annotation=annotation, classes=invalid_classes
        )
    assert WebVTTCueSpanStartTagAnnotated(
        name="v", annotation=annotation, classes=valid_classes
    )

    # --- Internal text: components validation ---
    valid_components = [
        WebVTTCueComponentWithTerminator(
            component=WebVTTCueTextSpan(text="random text")
        )
    ]
    invalid_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        WebVTTCueInternalText(components=invalid_components)
    assert WebVTTCueInternalText(components=valid_components)

    # --- Voice spans ---
    # Expected strings follow the WebVTT cue voice span syntax,
    # "<v.class1.class2 annotation>text</v>": a span built with a start tag
    # cannot serialize to its bare inner text.
    cue_span = WebVTTCueVoiceSpan(
        start_tag=WebVTTCueSpanStartTagAnnotated(
            name="v", annotation="speaker", classes=["loud", "clear"]
        ),
        internal_text=WebVTTCueInternalText(
            components=[
                WebVTTCueComponentWithTerminator(
                    component=WebVTTCueTextSpan(text="random text")
                )
            ]
        ),
    )
    expected_str = "<v.loud.clear speaker>random text</v>"
    assert str(cue_span) == expected_str

    cue_span = WebVTTCueVoiceSpan(
        start_tag=WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
        internal_text=WebVTTCueInternalText(
            components=[
                WebVTTCueComponentWithTerminator(
                    component=WebVTTCueTextSpan(text="random text")
                )
            ]
        ),
    )
    expected_str = "<v speaker>random text</v>"
    assert str(cue_span) == expected_str


def test_webvttcueblock_parse() -> None:
    """Test the method parse of the WebVTTCueBlock class."""
    # A cue whose payload is a single plain text span.
    raw: str = (
        "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
    )
    block: WebVTTCueBlock = WebVTTCueBlock.parse(raw)
    assert str(block.timings) == "04:02.500 --> 04:05.000"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
    assert (
        block.payload[0].component.text
        == "J’ai commencé le basket à l'âge de 13, 14 ans"
    )
    assert raw == str(block)

    # A cue with a language span nested inside an italic span: the payload
    # splits into text / italic span / text, as asserted below.
    raw = (
        "04:05.001 --> 04:07.800\n"
        "Sur les <i.foreignphrase><lang en>playground</lang></i>, "
        "ici à Montpellier\n"
    )
    block = WebVTTCueBlock.parse(raw)
    assert str(block.timings) == "04:05.001 --> 04:07.800"
    assert len(block.payload) == 3
    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
    assert block.payload[0].component.text == "Sur les "
    assert isinstance(block.payload[1], WebVTTCueComponentWithTerminator)
    assert isinstance(block.payload[1].component, WebVTTCueItalicSpan)
    assert len(block.payload[1].component.internal_text.components) == 1
    lang_span = block.payload[1].component.internal_text.components[0].component
    assert isinstance(lang_span, WebVTTCueLanguageSpan)
    assert isinstance(
        lang_span.internal_text.components[0].component, WebVTTCueTextSpan
    )
    assert lang_span.internal_text.components[0].component.text == "playground"
    assert isinstance(block.payload[2], WebVTTCueComponentWithTerminator)
    assert isinstance(block.payload[2].component, WebVTTCueTextSpan)
    assert block.payload[2].component.text == ", ici à Montpellier"
    assert raw == str(block)


def test_webvtt_file() -> None:
    """Test parsing complete WebVTT files from the test fixtures."""
    # Example 01: voice spans with nested italics.
    content = _read_text("./test/data/webvtt/webvtt_example_01.vtt")
    vtt = WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span.component, WebVTTCueVoiceSpan)
    assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
    assert not cue_span.component.start_tag.classes
    assert len(cue_span.component.internal_text.components) == 1
    comp = cue_span.component.internal_text.components[0]
    assert isinstance(comp.component, WebVTTCueItalicSpan)
    assert len(comp.component.internal_text.components) == 1
    comp2 = comp.component.internal_text.components[0]
    assert isinstance(comp2.component, WebVTTCueTextSpan)
    assert comp2.component.text == "Laughs"

    # Example 02: re-assemble the file from its parsed cue blocks and compare
    # it with the original content (round trip).
    content = _read_text("./test/data/webvtt/webvtt_example_02.vtt")
    vtt = WebVTTFile.parse(content)
    assert len(vtt) == 4
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n".join(
        block.format(omit_hours_if_zero=True, omit_voice_end=True)
        for block in vtt.cue_blocks
    )
    # The fixture has no trailing newline, hence the rstrip().
    assert content == reverse.rstrip()

    # Example 03: every cue block carries an identifier.
    content = _read_text("./test/data/webvtt/webvtt_example_03.vtt")
    vtt = WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0].component, WebVTTCueVoiceSpan)
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
    assert block.payload[0].component.text == "Good."
    assert not vtt.title

    # Example 04: title, NOTE, STYLE and REGION blocks. Parsing must not emit
    # any warning, so warnings are promoted to errors while parsing.
    content = _read_text("./test/data/webvtt/webvtt_example_04.vtt")
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        vtt = WebVTTFile.parse(content)
    assert len(vtt) == 2
    block = vtt.cue_blocks[1]
    assert len(block.payload) == 5
    # NOTE(review): if the fixture's last cue carries cue markup (e.g. a voice
    # tag), this literal must include it -- confirm against
    # test/data/webvtt/webvtt_example_04.vtt.
    assert str(block) == (
        "00:05.000 --> 00:09.000\n"
        "— It will perforate your stomach.\n"
        "— You could die.\n"
        "This is true.\n"
    )
    assert vtt.title == "Danger of Nitrogen"


def test_webvtt_cue_language_span_start_tag():
    """Test the validation of language span start tag annotations."""
    # Well-formed language tags are accepted.
    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
    # Malformed tags are rejected with an error message mentioning BCP 47.
    with pytest.raises(ValidationError, match="BCP 47"):
        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
    with pytest.raises(ValidationError, match="BCP 47"):
        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')