Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
db920ef
refactor: move WebVTT data model from docling
ceberam Nov 14, 2025
2e9663e
fix(webvtt): deal with HTML entities in cue text spans
ceberam Nov 14, 2025
ea303db
refactor(webvtt): support more WebVTT models
ceberam Nov 17, 2025
0122141
refactor(DoclingDocument): create a new provenance model for media fi…
ceberam Nov 27, 2025
b9bb053
refactor(webvtt): make WebVTTTimestamp public
ceberam Dec 4, 2025
b26c086
refactor(webvtt): set languages to a list of strings in ProvenanceTrack
ceberam Dec 12, 2025
d0c97fc
tests(webvtt): add test for ProvenanceTrack
ceberam Dec 12, 2025
86d7fe4
refactor(webvtt): make all WebVTT classes public for reuse
ceberam Dec 12, 2025
82e80c0
chore(webvtt): preserve newlines as WebVTTLineTerminator
ceberam Dec 12, 2025
5721f09
refactor(webvtt): set ProvenanceTrack time fields as float
ceberam Dec 14, 2025
134cf95
chore(webvtt): ensure start time offsets are in sequence
ceberam Dec 15, 2025
3983b44
chore(webvtt): improve regex to remove note,region,style blocks
ceberam Dec 16, 2025
ff30e42
chore(webvtt): parse the WebVTT file title
ceberam Dec 16, 2025
6da51be
chore(webvtt): rebase to latest changes in idoctags
ceberam Jan 6, 2026
0a9e190
feat(webvtt): add WebVTT serializer
ceberam Jan 19, 2026
0b24861
fix(webvtt): add 'text/vtt' as extra mimetype
ceberam Jan 19, 2026
5e0a787
refactor(webvtt): roll back DocItem.prov as list of ProvenanceItem
ceberam Jan 22, 2026
00a355d
tests(webvtt): fix test with STYLE and NOTE blocks
ceberam Jan 23, 2026
818fc62
style(webvtt): apply X | Y annotation instead of Optional, Union
ceberam Jan 23, 2026
55fb835
refactor(webvtt): simplify TrackProvenance model with tags
ceberam Jan 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docling_core/transforms/serializer/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,10 @@
DocSerializer,
create_ser_result,
)
from docling_core.types.doc.base import CoordOrigin
from docling_core.types.doc.document import (
from docling_core.types.doc import (
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
FormItem,
InlineGroup,
Expand All @@ -59,7 +60,6 @@
TableItem,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel


def _bbox_to_polygon_coords(
Expand Down
8 changes: 4 additions & 4 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@
SerializationResult,
Span,
)
from docling_core.types.doc.document import (
DOCUMENT_TOKENS_EXPORT_LABELS,
from docling_core.types.doc import (
ContentLayer,
DescriptionAnnotation,
DocItem,
DocItemLabel,
DoclingDocument,
FloatingItem,
Formatting,
Expand All @@ -57,7 +57,7 @@
TableItem,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS

_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
_DEFAULT_LAYERS = set(ContentLayer)
Expand Down Expand Up @@ -317,7 +317,7 @@ def serialize_doc(
parts: list[SerializationResult],
**kwargs: Any,
) -> SerializationResult:
"""Serialize a document out of its pages."""
"""Serialize a document out of its parts."""
...

def _serialize_body(self, **kwargs) -> SerializationResult:
Expand Down
8 changes: 5 additions & 3 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@
_should_use_legacy_annotations,
create_ser_result,
)
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.document import (
BoundingBox,
CodeItem,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentToken,
FloatingItem,
FormItem,
GroupItem,
Expand All @@ -40,17 +42,17 @@
ListItem,
NodeItem,
PictureClassificationData,
PictureClassificationLabel,
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
ProvenanceItem,
SectionHeaderItem,
TableData,
TableItem,
TableToken,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken


def _wrap(text: str, wrap_tag: str) -> str:
Expand Down
Loading
Loading