
Commit 33d258c

Merge pull request #95 from swisstopo/feat/issue-89/title-page-behaviour
Change behaviour of Title page detection
2 parents 2c636e3 + 7628fda commit 33d258c

File tree

16 files changed

+5464
-6508
lines changed


.gitignore

Lines changed: 2 additions & 1 deletion
```diff
@@ -22,7 +22,8 @@ minio
 
 # IDE config
 .idea/
-.vscode/
+.vscode/*
+!.vscode/launch.json.template.jsonc
 
 # Package metadata
 *.egg-info/
```

.vscode/launch.json.template.jsonc

Lines changed: 29 additions & 0 deletions
```jsonc
{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python: Run dataset split (single pages)",
      "type": "debugpy",
      "request": "launch",
      "program": "src/scripts/split_data.py",
      "console": "integratedTerminal",
      "args": ["-i", "data/single_pages", "-o", "data/single_pages_splits", "-rv", "0.2", "-rt", "0.0"],
      "python": "${workspaceFolder}/venv/bin/python3",
      "env": {
        "PYTHONPATH": "${workspaceFolder}"
      }
    },
    {
      "name": "Python: Train XGBoost classification (single pages)",
      "type": "debugpy",
      "request": "launch",
      "program": "src/models/treebased/train.py",
      "console": "integratedTerminal",
      "args": ["--config-file-path", "config/xgboost_config.yml", "--out-directory", "models/xgboost_model"],
      "python": "${workspaceFolder}/venv/bin/python3",
      "env": {
        "PYTHONPATH": "${workspaceFolder}"
      }
    }
  ]
}
```

CLASSIFICATION.md

Lines changed: 122 additions & 0 deletions
# XGBoost Classification

To train the classifier, the development package needs to be installed and MLflow tracking activated.

The dataset used to train the provided model (`models/stable/model.joblib`) is internal and not publicly available. It is stored in a private S3 bucket (`stijnvermeeren-assets-data`) accessible only to the project team. The dataset is composed of 1011 labeled single-page PDFs across 9 classes, with ground truth available under `data/gt_single_pages_2026.json`. The distribution of the pages is listed below.

| Class           | Number | Percentage |
|-----------------|-------:|-----------:|
| boreprofile     |    115 |       11.4 |
| diagram         |    106 |       10.5 |
| geo_profile     |     74 |        7.3 |
| map             |    126 |       12.5 |
| section_header  |     93 |        9.2 |
| table           |     60 |        5.9 |
| text            |    202 |       20.0 |
| title_page      |    109 |       10.8 |
| unknown         |    126 |       12.5 |

The classification results on the validation set are reported below.

| Class           | Precision | Recall | F1-score |
|-----------------|----------:|-------:|---------:|
| boreprofile     |      96.7 |   87.9 |     92.1 |
| diagram         |      84.6 |   84.6 |     84.6 |
| geo_profile     |      55.6 |   71.4 |     62.5 |
| map             |      63.6 |   80.8 |     71.2 |
| section_header  |      64.7 |   73.3 |     68.8 |
| table           |      90.9 |   83.3 |     87.0 |
| text            |      84.4 |   88.4 |     86.4 |
| title_page      |      95.0 |   95.0 |     95.0 |
| unknown         |      57.9 |   39.3 |     46.8 |
| Overall (macro) |      77.0 |   78.2 |     77.1 |
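The macro scores in the last row are plain unweighted means of the per-class values; a quick sanity check (numbers copied from the validation table above):

```python
# Per-class precision and recall from the validation table above.
precision = [96.7, 84.6, 55.6, 63.6, 64.7, 90.9, 84.4, 95.0, 57.9]
recall = [87.9, 84.6, 71.4, 80.8, 73.3, 83.3, 88.4, 95.0, 39.3]

def macro(scores: list[float]) -> float:
    """Unweighted (macro) average, rounded to one decimal like the table."""
    return round(sum(scores) / len(scores), 1)

print(macro(precision))  # 77.0
print(macro(recall))     # 78.2
```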
## Train with your own data

### 1. Prepare the folder structure

Organize your labeled single-page images with one subfolder per class:

```
data/single_pages/
├── boreprofile/
├── diagram/
├── geo_profile/
├── map/
├── section_header/
├── table/
├── text/
├── title_page/
└── unknown/
```
### 2. Prepare the ground truth

The ground truth file is a JSON list of labeled documents. Follow the same format as `data/gt_single_pages.json`:

```jsonc
[
  {
    "filename": "24911_1.pdf",  // file name relative to the train/validation folder
    "metadata": {
      "page_count": 1           // total number of pages in the document
    },
    "pages": [
      {
        "page": 1,                // page number (1-indexed)
        "classification": {       // one-hot encoding of the page class
          "text": 0,
          "boreprofile": 0,
          "map": 0,
          "geo_profile": 0,
          "title_page": 1,
          "diagram": 0,
          "table": 0,
          "unknown": 0,
          "section_header": 0
        }
      }
    ]
  }
]
```
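Reading this format back is straightforward; a small sketch, assuming each one-hot dict has exactly one class set to 1 (the helper `label_for_page` is illustrative, not part of the repository):

```python
import json

def label_for_page(entry: dict, page_number: int) -> str:
    """Return the class whose one-hot value is 1 for the given page."""
    for page in entry["pages"]:
        if page["page"] == page_number:
            # Pick the single class flagged with 1 in the one-hot dict.
            return max(page["classification"], key=page["classification"].get)
    raise KeyError(f"page {page_number} not found in {entry['filename']}")

# One document in the ground-truth format shown above.
gt = json.loads("""[
  {"filename": "24911_1.pdf",
   "metadata": {"page_count": 1},
   "pages": [{"page": 1,
              "classification": {"text": 0, "boreprofile": 0, "map": 0,
                                 "geo_profile": 0, "title_page": 1, "diagram": 0,
                                 "table": 0, "unknown": 0, "section_header": 0}}]}
]""")
print(label_for_page(gt[0], 1))  # title_page
```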
### 3. Split into train and validation sets

Split the dataset with an 80/20 train-validation ratio based on filename:

```bash
python src/scripts/split_data.py \
  -i data/single_pages \
  -o data/single_pages_splits \
  -rv 0.2 \
  -rt 0.0
```
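The actual logic lives in `src/scripts/split_data.py`; a deterministic filename-based split could be sketched as follows (the hashing scheme here is an assumption for illustration, not the script's actual implementation):

```python
import hashlib

def assign_split(filename: str, val_ratio: float = 0.2) -> str:
    """Deterministically assign a file to 'train' or 'validation'.

    Hashing the filename (rather than shuffling) keeps the assignment
    stable across runs, so pages of the same file always land in the
    same split.
    """
    digest = hashlib.md5(filename.encode()).hexdigest()
    bucket = int(digest, 16) % 100  # map the hash to a bucket in 0..99
    return "validation" if bucket < val_ratio * 100 else "train"

print(assign_split("24911_1.pdf"))
```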
### 4. Update the config

Edit `config/xgboost_config.yml` to point to your data:

```yaml
# Path to the training set
train_folder_path: "data/single_pages_splits/train"
# Path to the validation set
val_folder_path: "data/single_pages_splits/validation"
# Ground truth for model training and validation
ground_truth_file_path: "data/gt_single_pages.json"
```
### 5. Train the model

```bash
python -m src.models.treebased.train \
  --config-file-path config/xgboost_config.yml \
  --out-directory models/xgboost_model
```

The trained model will be saved under `models/xgboost_model`. On macOS, if you encounter OpenMP issues, install the library via Homebrew first:

```bash
brew install libomp
```

api/app/v1/schemas.py

Lines changed: 17 additions & 11 deletions
```diff
@@ -1,18 +1,22 @@
-from enum import Enum
-from typing import TypeAlias
+from enum import StrEnum
 
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_pascal
 
 from src.page_classes import PageClasses
 
-# dynamically created Enum to expose PascalCase class names to the API.
-PascalPageClasses = Enum(
-    "PascalPageClasses",
-    {name: to_pascal(value) for name, value in PageClasses.__members__.items()},
-    type=str,
-)
-PascalPageClasses: TypeAlias = PascalPageClasses  # pyright: ignore[reportInvalidTypeForm]
+
+class PascalPageClasses(StrEnum):
+    """Enum for classifying pages into page types."""
+
+    BOREPROFILE = "Boreprofile"
+    DIAGRAM = "Diagram"
+    GEO_PROFILE = "GeoProfile"
+    MAP = "Map"
+    TABLE = "Table"
+    TEXT = "Text"
+    TITLE_PAGE = "TitlePage"
+    UNKNOWN = "Unknown"
 
 
 class MetaDataSchema(BaseModel):
@@ -63,7 +67,7 @@ class PredictionSchema(BaseModel):
     pages: list[PagePrediction]
 
     @classmethod
-    def from_prediction(cls, prediction: dict[dict]):
+    def from_prediction(cls, prediction: dict):
         return cls(
             filename=prediction["filename"],
             metadata=MetaDataSchema.from_prediction(prediction["metadata"]),
@@ -107,9 +111,11 @@ def create_response(cls, predictions: list[dict]):
 def predicted_class(classification: PageClasses) -> PascalPageClasses:
     """Parse the predicted class from a one-hot encoded classification dictionary.
 
-    The values of the dict are the sting representation of each class in the PageClasses enum.
+    The values of the dict are the string representation of each class in the PageClasses enum.
     """
     try:
+        # Cast detected pages to Pascal equivalent
         return PascalPageClasses(to_pascal(classification))
     except ValueError:
+        # Other undefined classes such as Section Header
         return PascalPageClasses.UNKNOWN
```

config/xgboost_config.yml

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,7 +1,7 @@
 model_type: xgboost
 
 train_folder_path: "data/single_pages_splits/train"
-val_folder_path: "data/single_pages_splits/val"
+val_folder_path: "data/single_pages_splits/validation"
 ground_truth_file_path: "data/gt_single_pages.json"
 
 # Feature names to track (23 features total)
@@ -34,7 +34,6 @@ feature_names:
   - Num Long or Horizontal Lines
   - Text Line Count
 
-
 hyperparameters:
   n_estimators: 600
   max_depth: 6
@@ -51,4 +50,4 @@ tuning:
   max_depth: [4, 5, 6, 7, 8]
   learning_rate: [0.01, 0.03, 0.05, 0.1, 0.2]
   subsample: [0.6, 0.8, 1.0]
-  colsample_bytree: [0.6, 0.8, 1.0]
+  colsample_bytree: [0.6, 0.8, 1.0]
```
