Skip to content

Commit 2c636e3

Browse files
Merge pull request #93 from swisstopo/feat/issue-87/return-document-title
Add document title extraction
2 parents 419f0c2 + 0c1fbf5 commit 2c636e3

File tree

8 files changed

+260
-52
lines changed

8 files changed

+260
-52
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,10 @@ The output of the pipeline is dependent of the version queried.
9898
"entities": [ // List of elements present in file
9999
{
100100
"classification": "boreprofile", // Type of element (PageClasses)
101+
"language": "de", // Detected language
101102
"page_start": 1, // Starting page
102103
"page_end": 3, // Ending page
103-
"language": "de" // Detected language
104+
"title": "BS1" // Entity title (None if not found)
104105
}
105106
]
106107
}

api/app/v2/schemas.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ class CollectResponse(BaseModel):
2020
"entities": [
2121
{
2222
"classification": "boreprofile",
23+
"language": "de",
2324
"page_start": 1,
2425
"page_end": 3,
25-
"language": "de",
26+
"title": "BS1",
2627
},
2728
],
2829
},

config/local_matching_params.yml

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
table_of_contents:
32
en:
43
- table of contents
@@ -33,20 +32,20 @@ caption_description:
3332
boreprofile:
3433
en:
3534
must_contain:
36-
- [ profile, log, lithostratigraphy ]
37-
- [ borehole, drilling, bore ]
35+
- [profile, log, lithostratigraphy]
36+
- [borehole, drilling, bore]
3837
de:
3938
must_contain:
40-
- [ profil, log, lithostratigraphie]
41-
- [ bohrung, bohrloch, bohr ]
39+
- [profil, log, lithostratigraphie]
40+
- [bohrung, bohrloch, bohr]
4241
fr:
4342
must_contain:
44-
- [ profil, log, lithostratigraphique ]
45-
- [ forage, sondage ]
43+
- [profil, log, lithostratigraphique]
44+
- [forage, sondage]
4645
it:
4746
must_contain:
48-
- [ profilo, log, stratigrafico, diagramma ]
49-
- [ perforazione, sondaggio ]
47+
- [profilo, log, stratigrafico, diagramma]
48+
- [perforazione, sondaggio]
5049

5150
boreprofile:
5251
en:
@@ -173,7 +172,6 @@ geo_profile:
173172
- sezione verticale
174173
- taglio geologico
175174

176-
177175
diagram:
178176
en:
179177
- diagram
@@ -245,4 +243,4 @@ open_ended_depth_key:
245243
- à partir de
246244
- from
247245
- starting at
248-
- a partire
246+
- a partire

main.py

Lines changed: 67 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
from dotenv import load_dotenv
99
from swissgeol_doc_processing.utils.file_utils import read_params as swissgeol_read_params
1010

11+
from src.boreprofile.entity_parser import document_to_boreprofiles
1112
from src.classifiers.classifier_factory import ClassifierTypes, create_classifier
1213
from src.constants import DEFAULT_TREEBASED_MODEL_PATH
14+
from src.page_classes import PageClasses
1315
from src.page_structure import (
1416
ProcessedEntities,
1517
ProcessorDocument,
@@ -92,7 +94,7 @@ def forward_document(
9294
"""Infer document classes.
9395
9496
Args:
95-
pdf_files (list[Path]): List fo documents to classify.
97+
pdf_files (list[Path]): List of documents to classify.
9698
matching_params (dict): Dict of parameters for document processing.
9799
borehole_matching_params (dict): Dict of parameters for borehole matching.
98100
model_path (str, optional): Path to pretrained model.
@@ -114,13 +116,45 @@ def forward_document(
114116
return processor.process_batch(pdf_files)
115117

116118

119+
def forward_document_entities_group(
120+
classification: PageClasses,
121+
page_start: int,
122+
page_end: int,
123+
language: str | None,
124+
pdf_file: Path,
125+
) -> list[ProcessedEntities]:
126+
"""Extract entities from a group of consecutive pages with the same classification.
127+
128+
Args:
129+
classification (PageClasses): The classification type of the page group.
130+
page_start (int): First page index in the consecutive group (1-based).
131+
page_end (int): Last page index in the consecutive group (1-based).
132+
language (str): Detected language of the page group.
133+
pdf_file (Path): Path to the source PDF file.
134+
135+
Returns:
136+
list[ProcessedEntities]: Extracted entities from the page group.
137+
"""
138+
if classification == PageClasses.BOREPROFILE:
139+
return document_to_boreprofiles(pdf_file=pdf_file, page_start=page_start, page_end=page_end, lang=language)
140+
else:
141+
return [
142+
ProcessedEntities(
143+
classification=classification,
144+
page_start=page_start,
145+
page_end=page_end,
146+
language=language,
147+
)
148+
]
149+
150+
117151
def forward_document_entities(
118152
documents: list[ProcessorDocument],
119153
) -> list[ProcessorDocumentEntities]:
120154
"""Convert classified documents pages to entities.
121155
122156
Args:
123-
documents (list[ProcessorDocument]): List of documents to process.
157+
documents (list[ProcessorDocument]): List of documents to convert to entities.
124158
125159
Returns:
126160
list[ProcessorDocumentEntities]: Processed documents entities
@@ -132,19 +166,20 @@ def forward_document_entities(
132166
# Iterate over grouped entities types
133167
for (pages_type, lang), pages in document.group_pages_by_type():
134168
# Get pages sequences
135-
pages_id = sorted([page.page for page in pages])
136-
results_entities.extend(
137-
[
138-
ProcessedEntities(
139-
classification=pages_type,
140-
page_start=min(pages_group),
141-
page_end=max(pages_group),
142-
language=lang,
143-
)
144-
# Group consecutive [1,2,10] -> [1,2], [10]
145-
for pages_group in group_consecutive(pages_id)
146-
]
147-
)
169+
page_numbers = sorted([page.page for page in pages])
170+
entities = [
171+
entity
172+
for pages_group in group_consecutive(page_numbers) # Group consecutive [1,2,10] -> [1,2], [10]
173+
for entity in forward_document_entities_group(
174+
classification=pages_type,
175+
page_start=min(pages_group),
176+
page_end=max(pages_group),
177+
language=lang,
178+
pdf_file=document.path,
179+
)
180+
]
181+
# Extend entry
182+
results_entities.extend(entities)
148183
# Create document from filename, metadata, entities
149184
documents_entities.append(
150185
ProcessorDocumentEntities(
@@ -166,20 +201,20 @@ def main(
166201
write_result: bool = False,
167202
explain_model: bool = False,
168203
return_entities: bool = False,
169-
) -> tuple[list[ProcessorDocument] | list[ProcessorDocumentEntities]]:
204+
) -> list[ProcessorDocument] | list[ProcessorDocumentEntities]:
170205
"""Run the page classification pipeline on input documents.
171206
172207
Args:
173208
input_path (str): Path to directory with PDF pages or documents.
174209
ground_truth_path (str, optional): Path to ground truth JSON file for evaluation.
175210
model_path (str, optional): Path to pretrained model.
176211
classifier_name (str, optional): Classifier to use ("treebased", "pixtral", etc.).
177-
write_result (bool): If True, writes results to prediction.json.
212+
write_result (bool): If True, and return_entities is True, writes results to prediction.json.
178213
explain_model (bool): If True, generates plots to explain the model's choices.
179214
return_entities (bool): If True, return grouped entities instead of per-page results.
180215
181-
Return:
182-
tuple[list[ProcessorDocument] | list[ProcessorDocumentEntities]]:
216+
Returns:
217+
list[ProcessorDocument] | list[ProcessorDocumentEntities]:
183218
* A list of `ProcessorDocument` containing per-page classifications, or
184219
* A list of `ProcessorDocumentEntities` containing grouped (multi-page) entities
185220
when `return_entities=True`.
@@ -200,7 +235,7 @@ def main(
200235
pdf_files = get_pdf_files(input_path)
201236
if not pdf_files:
202237
logger.error("No valid PDFs found.")
203-
return [], []
238+
return []
204239

205240
# Run individual page classification
206241
documents_pages = forward_document(
@@ -212,15 +247,6 @@ def main(
212247
explain_model=explain_model,
213248
)
214249

215-
# Check if data need to be saved
216-
if write_result:
217-
output_file = Path("data") / "prediction.json"
218-
output_file.parent.mkdir(parents=True, exist_ok=True)
219-
output_file.write_text(
220-
json.dumps([r.model_dump() for r in documents_pages], indent=4),
221-
encoding="utf-8",
222-
)
223-
224250
# Check if GT need to be computed
225251
if ground_truth_path:
226252
from src.evaluation import evaluate_results
@@ -236,7 +262,17 @@ def main(
236262
if not return_entities:
237263
return documents_pages
238264
else:
239-
return forward_document_entities(documents=documents_pages)
265+
entities = forward_document_entities(documents=documents_pages)
266+
267+
# Check if data needs to be saved
268+
if write_result:
269+
output_file = Path("data") / "prediction.json"
270+
output_file.parent.mkdir(parents=True, exist_ok=True)
271+
output_file.write_text(
272+
json.dumps([r.model_dump() for r in entities], indent=4),
273+
encoding="utf-8",
274+
)
275+
return entities
240276

241277

242278
if __name__ == "__main__":
@@ -301,4 +337,5 @@ def main(
301337
classifier_name=args.classifier,
302338
write_result=args.write_results,
303339
explain_model=args.explain_model,
340+
return_entities=True,
304341
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ requires-python = ">=3.11,<3.14"
1111

1212
# Production-only dependencies for API runtime
1313
dependencies = [
14-
"swissgeol-boreholes-dataextraction @ https://github.com/swisstopo/swissgeol-boreholes-dataextraction/releases/download/v1.0.138/swissgeol_boreholes_dataextraction-1.0.138-py3-none-any.whl",
14+
"swissgeol-boreholes-dataextraction @ https://github.com/swisstopo/swissgeol-boreholes-dataextraction/releases/download/v1.0.143/swissgeol_boreholes_dataextraction-1.0.143-py3-none-any.whl",
1515
"boto3==1.40.12",
1616
"fastapi==0.116.1",
1717
"boto3-stubs[s3]>=1.40.12,<2.0.0",

0 commit comments

Comments (0)