Skip to content

Commit ad288bb

Browse files
fynnosbigabig
authored and committed
reduce (span) classifier training memory by chunking long documents
1 parent 2c6f4ac commit ad288bb

File tree

9 files changed

+274
-99
lines changed

9 files changed

+274
-99
lines changed

backend/src/modules/classifier/classifier_dto.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from enum import Enum
33
from typing import Any, Literal
44

5+
from lightning_fabric.plugins.precision.precision import _PRECISION_INPUT
56
from pydantic import BaseModel, ConfigDict, Field
67

78
from repos.db.dto_base import UpdateDTOBase
@@ -121,6 +122,12 @@ class ClassifierTrainingParams(BaseModel):
121122
learning_rate: float = Field(description="Learning rate to use for training")
122123
weight_decay: float = Field(description="Weight decay to use for training")
123124
dropout: float = Field(description="Dropout rate to use in the model")
125+
chunk_size: int | None = Field(
126+
description="Slice long documents into chunks of size x"
127+
)
128+
precision: _PRECISION_INPUT | None = Field(
129+
description="Precision, e.g. 32-true, 16-mixed, 16-true, bf16-true, bf16-mixed"
130+
)
124131
# specific training settings
125132
is_bio: bool = Field(description="Whether to use BIO or IO tagging")
126133

backend/src/modules/classifier/classifier_service.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,13 @@ def handle_classifier_job(
6565
status_message="Started ClassifierJob!",
6666
)
6767

68-
# free GPU memory before job starts
69-
gc.collect()
70-
torch.cuda.empty_cache()
68+
gpu_mem_limit_gb = 20
69+
gpu_mem_limit_bytes = gpu_mem_limit_gb * 1024 * 1024 * 1024
70+
71+
# set GPU memory limit for job
72+
total = torch.cuda.get_device_properties().total_memory
73+
allowed_fraction = gpu_mem_limit_bytes / total
74+
torch.cuda.set_per_process_memory_fraction(allowed_fraction)
7175

7276
# get the correct classifier service
7377
tcs: TextClassificationModelService

backend/src/modules/classifier/models/doc_class_model_service.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def training_step(self, batch: dict[str, Any], batch_idx: int) -> torch.Tensor:
149149
# labels = batch["labels"]
150150
# loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
151151

152-
self.log("train_loss", loss, on_step=False, on_epoch=True)
152+
self.log("train_loss", loss.detach(), on_step=False, on_epoch=True)
153153
return loss
154154

155155
def _val_test_step(
@@ -186,20 +186,23 @@ def _val_test_step(
186186
)
187187
return outputs.loss
188188

189+
@torch.no_grad()
189190
def validation_step(self, batch: dict[str, Any], batch_idx: int) -> torch.Tensor:
190191
return self._val_test_step(
191192
prefix="eval",
192193
batch=batch,
193194
batch_idx=batch_idx,
194195
)
195196

197+
@torch.no_grad()
196198
def test_step(self, batch: dict[str, Any], batch_idx: int) -> torch.Tensor:
197199
return self._val_test_step(
198200
prefix="test",
199201
batch=batch,
200202
batch_idx=batch_idx,
201203
)
202204

205+
@torch.no_grad()
203206
def predict_step(self, batch: dict[str, Any], batch_idx: int) -> Any:
204207
outputs = self.model(
205208
input_ids=batch["input_ids"],
@@ -214,7 +217,10 @@ def predict_step(self, batch: dict[str, Any], batch_idx: int) -> Any:
214217

215218
def configure_optimizers(self) -> torch.optim.Optimizer:
216219
optimizer = torch.optim.AdamW(
217-
self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
220+
self.parameters(),
221+
lr=self.learning_rate,
222+
weight_decay=self.weight_decay,
223+
fused=True,
218224
)
219225
return optimizer
220226

@@ -399,17 +405,6 @@ def train(
399405

400406
# 2. Initialize PyTorch Lightning components
401407
job.update(current_step=2)
402-
# Initialize the Lightning Model
403-
lightning_model = DocClassificationLightningModel(
404-
base_name=parameters.base_name,
405-
num_labels=len(classid2labelid),
406-
dropout=parameters.dropout,
407-
learning_rate=parameters.learning_rate,
408-
weight_decay=parameters.weight_decay,
409-
class_weights=torch.tensor(class_weights, dtype=torch.float32),
410-
id2label=id2label,
411-
label2id={v: k for k, v in id2label.items()},
412-
)
413408

414409
# Create the Trainer
415410
model_name: str = str(uuid4())
@@ -447,11 +442,24 @@ def train(
447442
max_epochs=parameters.epochs,
448443
callbacks=callbacks,
449444
enable_progress_bar=True,
445+
precision=parameters.precision,
450446
# Special params
451-
# precision=32, # full precision training
452447
# gradient_clip_val=1.0, # Gradient clipping
453448
)
454449

450+
with trainer.init_module():
451+
# Initialize the Lightning Model
452+
lightning_model = DocClassificationLightningModel(
453+
base_name=parameters.base_name,
454+
num_labels=len(classid2labelid),
455+
dropout=parameters.dropout,
456+
learning_rate=parameters.learning_rate,
457+
weight_decay=parameters.weight_decay,
458+
class_weights=torch.tensor(class_weights, dtype=torch.float32),
459+
id2label=id2label,
460+
label2id={v: k for k, v in id2label.items()},
461+
)
462+
455463
# 3. Train the model
456464
job.update(current_step=3)
457465
trainer.fit(

backend/src/modules/classifier/models/sent_class_model_service.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,10 @@ def predict_step(self, batch: dict[str, Any], batch_idx: int) -> Any:
225225

226226
def configure_optimizers(self) -> torch.optim.Optimizer:
227227
optimizer = torch.optim.AdamW(
228-
self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
228+
self.parameters(),
229+
lr=self.learning_rate,
230+
weight_decay=self.weight_decay,
231+
fused=True,
229232
)
230233
return optimizer
231234

@@ -485,23 +488,6 @@ def train(
485488

486489
# 2. Initialize PyTorch Lightning components
487490
job.update(current_step=2)
488-
# Initialize the Lightning Model
489-
lightning_model = SentClassificationLightningModel(
490-
# embedding model params
491-
embedding_model_name=parameters.base_name,
492-
embedding_dim=embedding_dim,
493-
# sent classifier specific params
494-
hidden_dim=int(embedding_dim / 2),
495-
use_lstm=True,
496-
# training params
497-
num_labels=len(classid2labelid),
498-
dropout=parameters.dropout,
499-
learning_rate=parameters.learning_rate,
500-
weight_decay=parameters.weight_decay,
501-
class_weights=torch.tensor(class_weights, dtype=torch.float32),
502-
id2label=id2label,
503-
label2id={v: k for k, v in id2label.items()},
504-
)
505491

506492
# Create the Trainer
507493
model_name: str = str(uuid4())
@@ -539,11 +525,30 @@ def train(
539525
max_epochs=parameters.epochs,
540526
callbacks=callbacks,
541527
enable_progress_bar=True,
528+
precision=parameters.precision,
542529
# Special params
543-
# precision=32, # full precision training
544530
# gradient_clip_val=1.0, # Gradient clipping
545531
)
546532

533+
with trainer.init_module():
534+
# Initialize the Lightning Model
535+
lightning_model = SentClassificationLightningModel(
536+
# embedding model params
537+
embedding_model_name=parameters.base_name,
538+
embedding_dim=embedding_dim,
539+
# sent classifier specific params
540+
hidden_dim=int(embedding_dim / 2),
541+
use_lstm=True,
542+
# training params
543+
num_labels=len(classid2labelid),
544+
dropout=parameters.dropout,
545+
learning_rate=parameters.learning_rate,
546+
weight_decay=parameters.weight_decay,
547+
class_weights=torch.tensor(class_weights, dtype=torch.float32),
548+
id2label=id2label,
549+
label2id={v: k for k, v in id2label.items()},
550+
)
551+
547552
# 3. Train the model
548553
job.update(current_step=3)
549554
trainer.fit(

0 commit comments

Comments
 (0)