Skip to content

Commit 1adac28

Browse files
committed
Merge remote-tracking branch 'origin/development' into development
2 parents ab87731 + 6bcde8b commit 1adac28

File tree

4 files changed

+70
-19
lines changed

4 files changed

+70
-19
lines changed

CONTRIBUTING.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Contributing to GlossAPI
2+
3+
## Working branches and PR flow
4+
- Open PRs are pushed against the `development` branch.
5+
- The `development` branch is merged into `master` when a) everything has been effectively used a few times and b) we reach a clear checkpoint.
6+
7+
## Some design principles
8+
- Corpus methods should be easy to use and descriptive.
9+
- Python files should be readable and well organized (check folder structure).
10+
- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline.
11+
12+
## Pipeline awareness and folder layout
13+
- Tie any pipeline change to the artifacts it produces. Common touchpoints:
14+
- `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`).
15+
- `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders.
16+
- `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`.
17+
- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable.
18+
19+
## Keep changes small
20+
- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting.

src/glossapi/corpus/phase_export.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -471,26 +471,18 @@ def _normalize_value(value: Any) -> Any:
471471
chunk_paths: List[Path] = entry.get("chunk_paths", []) or []
472472
base_path: Optional[Path] = entry.get("base_path")
473473
representative_path: Optional[Path] = base_path
474-
if representative_path is None and chunk_paths:
475-
representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0]
476474
base_metadata = metadata_by_stem.get(stem)
477475
chunk_metadata = metadata_chunks_by_stem.get(stem, [])
478476
if base_metadata is None and not chunk_metadata:
479477
continue
480478
metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata)
481479
metadata = {k: _normalize_value(v) for k, v in metadata.items()}
482480
original_filename_value = metadata.get("filename")
483-
if chunk_paths:
484-
ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key)
485-
parts: List[str] = []
486-
for path in ordered_chunks:
487-
parts.append(path.read_text(encoding="utf-8"))
488-
document_text = "\n".join(parts)
489-
elif representative_path is not None:
490-
document_text = representative_path.read_text(encoding="utf-8")
491-
else:
481+
if base_path is None or not base_path.exists():
492482
continue
493483

484+
document_text = base_path.read_text(encoding="utf-8")
485+
494486
filetype = metadata.get("filetype") or metadata.get("file_ext")
495487
if not filetype:
496488
filename_candidate = original_filename_value or metadata.get("filename")

src/glossapi/gloss_extract.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
813813
except Exception as e:
814814
self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}")
815815

816+
# Always attempt to assemble whatever chunks succeeded (best-effort)
817+
out_md_path = output_dir / f"{stem}.md"
818+
final_md_written = False
819+
if all_segments:
820+
try:
821+
final_md = "\n\n".join(all_segments)
822+
out_md_path.write_text(final_md, encoding="utf-8")
823+
final_md_written = True
824+
except Exception as e:
825+
self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}")
826+
816827
if not completed:
817828
# Record failure/timeout provenance in parquet
818829
try:
@@ -827,6 +838,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
827838
chunk_size=self.chunk_size,
828839
chunk_count=len(manifest.get("entries", [])),
829840
chunk_manifest_path=manifest_path,
841+
no_partial_output=not final_md_written,
830842
)
831843
except Exception as e:
832844
self._log.warning(f"Failed to record chunked extraction metadata for {file_path.name}: {e}")
@@ -838,14 +850,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
838850
self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}")
839851
return False
840852

841-
# Assemble final markdown
842-
try:
843-
final_md = "\n\n".join(all_segments)
844-
out_md_path = output_dir / f"{stem}.md"
845-
with out_md_path.open("w", encoding="utf-8") as fp:
846-
fp.write(final_md)
847-
except Exception as e:
848-
self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}")
853+
if not final_md_written:
849854
return False
850855
# Record success provenance in parquet
851856
try:

tests/test_jsonl_export.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path):
458458
assert len(seen_doc_ids) == len(texts)
459459

460460

461+
def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path):
462+
corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks")
463+
464+
base_text = "## Base Title\n\nMerged body from extraction."
465+
base_path = corpus.cleaned_markdown_dir / "chunked.md"
466+
base_path.parent.mkdir(parents=True, exist_ok=True)
467+
base_path.write_text(base_text, encoding="utf-8")
468+
469+
chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked"
470+
chunk_dir.mkdir(parents=True, exist_ok=True)
471+
(chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8")
472+
(chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8")
473+
474+
_write_download_results(
475+
corpus.output_dir / "download_results" / "download_results.parquet",
476+
[
477+
{
478+
"filename": "chunked.pdf",
479+
"filter": "ok",
480+
"needs_ocr": False,
481+
"is_empty": False,
482+
"char_count_no_comments": 10,
483+
}
484+
],
485+
)
486+
487+
out_path = corpus.output_dir / "chunked.jsonl"
488+
corpus.jsonl(out_path)
489+
490+
record = json.loads(out_path.read_text(encoding="utf-8").strip())
491+
assert record["document"] == base_text
492+
493+
461494
@pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed")
462495
def test_hf_streaming_loader_example(tmp_path):
463496
corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7")
@@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path):
531564
table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019))
532565

533566
assert set(table.column("doc_id").to_pylist()) == {"a"}
567+
534568
def _expected_doc_id(filename: str) -> str:
535569
return hashlib.sha256(filename.encode("utf-8")).hexdigest()

0 commit comments

Comments
 (0)