Skip to content

Commit 1adac28

Browse files
committed
Merge remote-tracking branch 'origin/development' into development
2 parents ab87731 + 6bcde8b commit 1adac28

File tree

4 files changed

+70
-19
lines changed

4 files changed

+70
-19
lines changed

CONTRIBUTING.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Contributing to GlossAPI
2+
3+
## Working branches and PR flow
4+
- Open PRs are pushed against the `development` branch.
5+
- The `development` branch is merged into `master` when a) everything has been effectively used a few times and b) we reach a clear checkpoint.
6+
7+
## Some design principles
8+
- Corpus methods should be easy to use and descriptive.
9+
- Python files should be readable and well organized (check folder structure).
10+
- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline.
11+
12+
## Pipeline awareness and folder layout
13+
- Tie any pipeline change to the artifacts it produces. Common touchpoints:
14+
- `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`).
15+
- `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders.
16+
- `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`.
17+
- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable.
18+
19+
## Keep changes small
20+
- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting.

src/glossapi/corpus/phase_export.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -471,26 +471,18 @@ def _normalize_value(value: Any) -> Any:
471471
chunk_paths: List[Path] = entry.get("chunk_paths", []) or []
472472
base_path: Optional[Path] = entry.get("base_path")
473473
representative_path: Optional[Path] = base_path
474-
if representative_path is None and chunk_paths:
475-
representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0]
476474
base_metadata = metadata_by_stem.get(stem)
477475
chunk_metadata = metadata_chunks_by_stem.get(stem, [])
478476
if base_metadata is None and not chunk_metadata:
479477
continue
480478
metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata)
481479
metadata = {k: _normalize_value(v) for k, v in metadata.items()}
482480
original_filename_value = metadata.get("filename")
483-
if chunk_paths:
484-
ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key)
485-
parts: List[str] = []
486-
for path in ordered_chunks:
487-
parts.append(path.read_text(encoding="utf-8"))
488-
document_text = "\n".join(parts)
489-
elif representative_path is not None:
490-
document_text = representative_path.read_text(encoding="utf-8")
491-
else:
481+
if base_path is None or not base_path.exists():
492482
continue
493483

484+
document_text = base_path.read_text(encoding="utf-8")
485+
494486
filetype = metadata.get("filetype") or metadata.get("file_ext")
495487
if not filetype:
496488
filename_candidate = original_filename_value or metadata.get("filename")

src/glossapi/gloss_extract.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
813813
except Exception as e:
814814
self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}")
815815

816+
# Always attempt to assemble whatever chunks succeeded (best-effort)
817+
out_md_path = output_dir / f"{stem}.md"
818+
final_md_written = False
819+
if all_segments:
820+
try:
821+
final_md = "\n\n".join(all_segments)
822+
out_md_path.write_text(final_md, encoding="utf-8")
823+
final_md_written = True
824+
except Exception as e:
825+
self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}")
826+
816827
if not completed:
817828
# Record failure/timeout provenance in parquet
818829
try:
@@ -827,6 +838,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
827838
chunk_size=self.chunk_size,
828839
chunk_count=len(manifest.get("entries", [])),
829840
chunk_manifest_path=manifest_path,
841+
no_partial_output=not final_md_written,
830842
)
831843
except Exception as e:
832844
self._log.warning(f"Failed to record chunked extraction metadata for {file_path.name}: {e}")
@@ -838,14 +850,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir:
838850
self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}")
839851
return False
840852

841-
# Assemble final markdown
842-
try:
843-
final_md = "\n\n".join(all_segments)
844-
out_md_path = output_dir / f"{stem}.md"
845-
with out_md_path.open("w", encoding="utf-8") as fp:
846-
fp.write(final_md)
847-
except Exception as e:
848-
self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}")
853+
if not final_md_written:
849854
return False
850855
# Record success provenance in parquet
851856
try:

tests/test_jsonl_export.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path):
458458
assert len(seen_doc_ids) == len(texts)
459459

460460

461+
def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path):
462+
corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks")
463+
464+
base_text = "## Base Title\n\nMerged body from extraction."
465+
base_path = corpus.cleaned_markdown_dir / "chunked.md"
466+
base_path.parent.mkdir(parents=True, exist_ok=True)
467+
base_path.write_text(base_text, encoding="utf-8")
468+
469+
chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked"
470+
chunk_dir.mkdir(parents=True, exist_ok=True)
471+
(chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8")
472+
(chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8")
473+
474+
_write_download_results(
475+
corpus.output_dir / "download_results" / "download_results.parquet",
476+
[
477+
{
478+
"filename": "chunked.pdf",
479+
"filter": "ok",
480+
"needs_ocr": False,
481+
"is_empty": False,
482+
"char_count_no_comments": 10,
483+
}
484+
],
485+
)
486+
487+
out_path = corpus.output_dir / "chunked.jsonl"
488+
corpus.jsonl(out_path)
489+
490+
record = json.loads(out_path.read_text(encoding="utf-8").strip())
491+
assert record["document"] == base_text
492+
493+
461494
@pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed")
462495
def test_hf_streaming_loader_example(tmp_path):
463496
corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7")
@@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path):
531564
table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019))
532565

533566
assert set(table.column("doc_id").to_pylist()) == {"a"}
567+
534568
def _expected_doc_id(filename: str) -> str:
535569
return hashlib.sha256(filename.encode("utf-8")).hexdigest()

0 commit comments

Comments
 (0)