docling-project
diff --git a/‎docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py‎
Lines changed: 10 additions & 4 deletions b/‎docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎docling_eval/campaign_tools/cvat_deliveries_to_hf.py‎
Lines changed: 148 additions & 83 deletions b/‎docling_eval/campaign_tools/cvat_deliveries_to_hf.py‎
Lines changed: 148 additions & 83 deletions
@@ -32,6 +32,7 @@
 
 from docling_eval.cli.main import create_cvat, create_eval, create_gt
 from docling_eval.datamodels.types import BenchMarkNames, PredictionProviderType
+from docling_eval.utils.utils import count_pages_in_file
 
 app = typer.Typer(add_completion=False)
 
@@ -67,14 +68,19 @@ def process_subdirectories(
     for subdir in subdirs:
         subdir_name = subdir.name
 
-        # Count total files in subdirectory
-        total_files = 0
+        # Collect all files and count pages
+        all_files: list[Path] = []
         for ext in ["pdf", "tif", "tiff", "jpg", "jpeg", "png", "bmp", "gif", "json"]:
-            total_files += len(list(subdir.glob(f"*.{ext}")))
-            total_files += len(list(subdir.glob(f"*.{ext.upper()}")))
+            all_files.extend(subdir.glob(f"*.{ext}"))
+            all_files.extend(subdir.glob(f"*.{ext.upper()}"))
+        all_files.sort()
+
+        total_files = len(all_files)
+        total_pages = sum(count_pages_in_file(f) for f in all_files)
 
         typer.echo(f"\nProcessing: {subdir_name}")
         typer.echo(f"  Total files found: {total_files}")
+        typer.echo(f"  Total pages found: {total_pages}")
 
         # Calculate number of chunks needed
         num_chunks = (total_files + max_files_per_chunk - 1) // max_files_per_chunk
 
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Sequence
+from typing import Dict, Iterable, List, Sequence
 
 import typer
 
@@ -19,7 +19,7 @@
 app = typer.Typer(
     help=(
         "Aggregate DoclingDocument JSON exports produced by the CVAT delivery "
-        "pipeline into HuggingFace-ready parquet datasets, grouped by subset name."
+        "pipeline into a single HuggingFace-ready parquet dataset with subset tags."
     ),
     no_args_is_help=True,
     add_completion=False,
@@ -42,14 +42,14 @@ class ConfigEntry:
 
 
 @dataclass(frozen=True)
-class SubsetBuildStats:
-    name: str
-    submissions: int
+class CombinedBuildStats:
     record_count: int
+    submission_count: int
+    subsets: List[str]
 
     def as_config(self, dataset_dir_name: str, split: str) -> ConfigEntry:
-        pattern = f"{self.name}/{dataset_dir_name}/{split}/shard_*.parquet"
-        return ConfigEntry(name=self.name, split=split, path_pattern=pattern)
+        pattern = f"{dataset_dir_name}/{split}/shard_*.parquet"
+        return ConfigEntry(name="default", split=split, path_pattern=pattern)
 
 
 FEATURE_FIELD_RENDER: Dict[FieldType, tuple[str, str]] = {
@@ -109,23 +109,37 @@ def link_file(source: Path, destination: Path) -> None:
 
 def populate_staging_dir(
     staging_dir: Path,
-    source_dirs: Sequence[Path],
-) -> int:
+    subset_sources: Dict[str, List[Path]],
+) -> tuple[int, Dict[str, str]]:
+    """
+    Populate staging directory with files from all subsets.
+
+    Returns:
+        Tuple of (total_file_count, mapping from staged_filename to subset_name)
+    """
     file_index = 0
+    file_to_subset: Dict[str, str] = {}
 
-    for dir_idx, source_dir in enumerate(source_dirs):
-        json_files = sorted(p for p in source_dir.glob("*.json") if p.is_file())
-        if not json_files:
-            _LOGGER.warning("No JSON files found under %s", source_dir)
-            continue
-
-        for json_file in json_files:
-            link_name = f"{dir_idx:03d}_{file_index:06d}_{json_file.name}"
-            destination = staging_dir / link_name
-            link_file(json_file, destination)
-            file_index += 1
+    for subset_name in sorted(subset_sources.keys()):
+        for dir_idx, source_dir in enumerate(subset_sources[subset_name]):
+            json_files = sorted(p for p in source_dir.glob("*.json") if p.is_file())
+            if not json_files:
+                _LOGGER.warning("No JSON files found under %s", source_dir)
+                continue
+
+            for json_file in json_files:
+                # Prefix with subset name so we can extract it later
+                link_name = (
+                    f"{subset_name}_{dir_idx:03d}_{file_index:06d}_{json_file.name}"
+                )
+                destination = staging_dir / link_name
+                link_file(json_file, destination)
+                # Map the staged filename (without extension) to subset name
+                staged_stem = destination.stem
+                file_to_subset[staged_stem] = subset_name
+                file_index += 1
 
-    return file_index
+    return file_index, file_to_subset
 
 
 def read_num_rows(dataset_root: Path) -> int:
@@ -144,18 +158,38 @@ def read_num_rows(dataset_root: Path) -> int:
     return 0
 
 
-def build_subset_dataset(
-    subset_name: str,
-    subset_dirs: Sequence[Path],
-    staging_root: Path,
+def iter_records_with_tags(
+    builder: FileDatasetBuilder, file_to_subset: Dict[str, str]
+) -> Iterable[DatasetRecord]:
+    """
+    Iterate over records from FileDatasetBuilder and add subset tags.
+
+    The builder creates records with doc_id from filename.stem, so we match
+    the staged filename stem to find the subset name.
+    """
+    for record in builder.iterate():
+        # FileDatasetBuilder sets doc_id to filename.stem
+        # Match it to our file_to_subset mapping
+        subset_name = file_to_subset.get(record.doc_id)
+        if subset_name:
+            record.tags.append(f"subset:{subset_name}")
+        yield record
+
+
+def build_combined_dataset(
+    subset_sources: Dict[str, List[Path]],
+    staging_dir: Path,
     output_root: Path,
     dataset_dir_name: str,
     split: str,
     chunk_size: int,
     export_kind: DeliveryExportKind,
     force: bool,
-) -> SubsetBuildStats | None:
-    target_root = output_root / subset_name / dataset_dir_name
+) -> CombinedBuildStats | None:
+    """
+    Build a single combined dataset from all subsets with subset tags.
+    """
+    target_root = output_root / dataset_dir_name
     if target_root.exists():
         if not force:
             raise RuntimeError(
@@ -166,85 +200,117 @@ def build_subset_dataset(
         )
         shutil.rmtree(target_root)
 
-    ensure_clean_dir(staging_root / subset_name)
-    staging_dir = staging_root / subset_name
-    processed = populate_staging_dir(staging_dir, subset_dirs)
+    ensure_clean_dir(staging_dir)
+    processed, file_to_subset = populate_staging_dir(staging_dir, subset_sources)
     if processed == 0:
-        _LOGGER.warning("Subset %s had no JSON payloads. Skipping.", subset_name)
+        _LOGGER.warning("No JSON payloads found across all subsets. Skipping.")
         shutil.rmtree(staging_dir)
         return None
 
     builder = FileDatasetBuilder(
-        name=f"{subset_name}-{export_kind.value}",
+        name=f"combined-{export_kind.value}",
         dataset_source=staging_dir,
         target=target_root,
         split=split,
         file_extensions=["json"],
     )
-    builder.save_to_disk(chunk_size=chunk_size)
+
+    # Custom save logic that adds tags
+    from docling.utils.utils import chunkify
+
+    from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info
+
+    test_dir = target_root / split
+    test_dir.mkdir(parents=True, exist_ok=True)
+
+    count = 0
+    chunk_count = 0
+
+    for record_chunk in chunkify(
+        iter_records_with_tags(builder, file_to_subset), chunk_size
+    ):
+        record_list = [r.as_record_dict() for r in record_chunk]
+        save_shard_to_disk(
+            items=record_list,
+            dataset_path=test_dir,
+            schema=DatasetRecord.pyarrow_schema(),
+            shard_id=chunk_count,
+        )
+        count += len(record_list)
+        chunk_count += 1
+
+    write_datasets_info(
+        name=builder.name,
+        output_dir=target_root,
+        num_train_rows=0,
+        num_test_rows=count,
+        features=DatasetRecord.features(),
+    )
+
     shutil.rmtree(staging_dir)
 
     record_count = read_num_rows(target_root)
-    submission_count = len(subset_dirs)
+    submission_count = sum(len(dirs) for dirs in subset_sources.values())
+    subset_names = sorted(subset_sources.keys())
 
-    return SubsetBuildStats(
-        name=subset_name,
-        submissions=submission_count,
+    return CombinedBuildStats(
         record_count=record_count if record_count else processed,
+        submission_count=submission_count,
+        subsets=subset_names,
     )
 
 
-def render_configs_block(configs: Sequence[ConfigEntry]) -> List[str]:
+def render_configs_block(config: ConfigEntry) -> List[str]:
     lines: List[str] = ["configs:"]
-    for entry in configs:
-        lines.append(f"- config_name: {entry.name}")
-        lines.append("  data_files:")
-        lines.append(f"    - split: {entry.split}")
-        lines.append(f"      path: {entry.path_pattern}")
+    lines.append(f"- config_name: {config.name}")
+    lines.append("  data_files:")
+    lines.append(f"    - split: {config.split}")
+    lines.append(f"      path: {config.path_pattern}")
     return lines
 
 
-def render_dataset_info_block(configs: Sequence[ConfigEntry]) -> List[str]:
+def render_dataset_info_block(config: ConfigEntry) -> List[str]:
     lines: List[str] = ["dataset_info:"]
     feature_rows = iter_dataset_features()
-    for entry in configs:
-        lines.append(f"- config_name: {entry.name}")
-        lines.append("  features:")
-        for feature_name, attr, value in feature_rows:
-            lines.append(f"  - name: {feature_name}")
-            lines.append(f"    {attr}: {value}")
-        lines.append("")
+    lines.append(f"- config_name: {config.name}")
+    lines.append("  features:")
+    for feature_name, attr, value in feature_rows:
+        lines.append(f"  - name: {feature_name}")
+        lines.append(f"    {attr}: {value}")
+    lines.append("")
     return lines
 
 
 def write_readme(
     output_root: Path,
-    configs: Sequence[ConfigEntry],
-    stats: Sequence[SubsetBuildStats],
+    config: ConfigEntry,
+    stats: CombinedBuildStats,
     license_name: str,
     export_kind: DeliveryExportKind,
 ) -> None:
     lines: List[str] = ["---"]
-    lines.extend(render_configs_block(configs))
-    lines.extend(render_dataset_info_block(configs))
+    lines.extend(render_configs_block(config))
+    lines.extend(render_dataset_info_block(config))
     lines.append(f"license: {license_name}")
     lines.append("---")
     lines.append("")
     lines.append("# CVAT Delivery Aggregation")
     lines.append(
         "This repository consolidates DoclingDocument exports produced by the "
-        "CVAT delivery pipeline into HuggingFace-ready parquet datasets."
+        "CVAT delivery pipeline into a single HuggingFace-ready parquet dataset."
     )
     lines.append("")
     lines.append(f"- Source payloads: `{export_kind.folder_name()}`")
     lines.append("- Builder: `FileDatasetBuilder` with JSON inputs")
+    lines.append(
+        "- Subset information: Each record includes a `tags` field with "
+        "`subset:<name>` entries indicating the source subset"
+    )
     lines.append("")
-    lines.append("## Subsets")
-    for subset in stats:
-        lines.append(
-            f"- `{subset.name}`: {subset.record_count} documents from "
-            f"{subset.submissions} submissions"
-        )
+    lines.append("## Dataset Statistics")
+    lines.append(f"- Total records: {stats.record_count}")
+    lines.append(f"- Total submissions: {stats.submission_count}")
+    lines.append(f"- Subsets included: {', '.join(f'`{s}`' for s in stats.subsets)}")
     readme_path = output_root / "README.md"
     with readme_path.open("w", encoding="utf-8") as handle:
         handle.write("\n".join(lines).rstrip() + "\n")
@@ -270,7 +336,7 @@ def main(
     dataset_dir_name: str = typer.Option(
         "gt_dataset",
         "--dataset-dir-name",
-        help="Name of the dataset directory created inside each subset folder.",
+        help="Name of the dataset directory created in the output folder.",
     ),
     chunk_size: int = typer.Option(
         80,
@@ -285,7 +351,7 @@ def main(
     force: bool = typer.Option(
         False,
         "--force/--no-force",
-        help="Overwrite any existing subset dataset directories and staging data.",
+        help="Overwrite any existing dataset directory and staging data.",
     ),
 ) -> None:
     logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
@@ -308,33 +374,32 @@ def main(
         )
         raise typer.Exit(code=1)
 
-    built_stats: List[SubsetBuildStats] = []
-    for subset_name in sorted(subset_sources.keys()):
-        stats = build_subset_dataset(
-            subset_name=subset_name,
-            subset_dirs=sorted(subset_sources[subset_name]),
-            staging_root=staging_root,
-            output_root=output_dir,
-            dataset_dir_name=dataset_dir_name,
-            split=split,
-            chunk_size=chunk_size,
-            export_kind=export_kind,
-            force=force,
-        )
-        if stats:
-            built_stats.append(stats)
+    # Sort subset sources for deterministic ordering
+    sorted_subset_sources = {k: sorted(v) for k, v in sorted(subset_sources.items())}
+
+    staging_dir = staging_root / "combined"
+    stats = build_combined_dataset(
+        subset_sources=sorted_subset_sources,
+        staging_dir=staging_dir,
+        output_root=output_dir,
+        dataset_dir_name=dataset_dir_name,
+        split=split,
+        chunk_size=chunk_size,
+        export_kind=export_kind,
+        force=force,
+    )
 
     shutil.rmtree(staging_root, ignore_errors=True)
 
-    if not built_stats:
+    if not stats:
         typer.echo("No datasets were produced.", err=True)
         raise typer.Exit(code=1)
 
-    configs = [stat.as_config(dataset_dir_name, split) for stat in built_stats]
+    config = stats.as_config(dataset_dir_name, split)
     write_readme(
         output_root=output_dir,
-        configs=configs,
-        stats=built_stats,
+        config=config,
+        stats=stats,
         license_name=license_name,
         export_kind=export_kind,
     )