Skip to content

Commit 60f878e

Browse files
fynnosbigabig
authored and committed
review feedback
1 parent 13d31bc commit 60f878e

File tree

9 files changed

+28
-14
lines changed

9 files changed

+28
-14
lines changed

backend/configs/development.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ vllm:
112112
port: ${oc.env:VLLM_EMB_PORT, 8000}
113113
model: ${oc.env:VLLM_EMB_MODEL, snowflake-arctic-embed2:568m}
114114

115+
job:
116+
gpu_memory_limit: ${oc.env:RQ_WORKER_GPU_MEM_GB, 20}
117+
115118
llm_assistant:
116119
sentence_annotation:
117120
few_shot_threshold: 4

backend/configs/production.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ vllm:
112112
port: ${oc.env:VLLM_EMB_PORT, 8000}
113113
model: ${oc.env:VLLM_EMB_MODEL, snowflake-arctic-embed2:568m}
114114

115+
job:
116+
gpu_memory_limit: ${oc.env:RQ_WORKER_GPU_MEM_GB, 20}
117+
115118
llm_assistant:
116119
sentence_annotation:
117120
few_shot_threshold: 4

backend/src/modules/classifier/classifier_service.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,6 @@ def handle_classifier_job(
6565
status_message="Started ClassifierJob!",
6666
)
6767

68-
gpu_mem_limit_gb = 20
69-
gpu_mem_limit_bytes = gpu_mem_limit_gb * 1024 * 1024 * 1024
70-
71-
# set GPU memory limit for job
72-
total = torch.cuda.get_device_properties().total_memory
73-
allowed_fraction = gpu_mem_limit_bytes / total
74-
torch.cuda.set_per_process_memory_fraction(allowed_fraction)
75-
7668
# get the correct classifier service
7769
tcs: TextClassificationModelService
7870
match payload.model_type:

backend/src/modules/classifier/models/doc_class_model_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def __init__(
7777
dropout: float,
7878
learning_rate: float,
7979
weight_decay: float,
80-
class_weights: torch.Tensor,
80+
class_weights: list[float],
8181
id2label: dict[int, str] | None = None,
8282
label2id: dict[str, int] | None = None,
8383
):
@@ -455,7 +455,7 @@ def train(
455455
dropout=parameters.dropout,
456456
learning_rate=parameters.learning_rate,
457457
weight_decay=parameters.weight_decay,
458-
class_weights=torch.tensor(class_weights, dtype=torch.float32),
458+
class_weights=class_weights,
459459
id2label=id2label,
460460
label2id={v: k for k, v in id2label.items()},
461461
)

backend/src/modules/classifier/models/sent_class_model_service.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def __init__(
7575
dropout: float,
7676
learning_rate: float,
7777
weight_decay: float,
78-
class_weights: torch.Tensor,
78+
class_weights: list[float],
7979
# special params
8080
embedding_model_name: str,
8181
embedding_dim: int,
@@ -200,20 +200,23 @@ def _val_test_step(self, prefix: str, batch, batch_idx: int) -> torch.Tensor:
200200
)
201201
return loss
202202

203+
@torch.no_grad()
203204
def validation_step(self, batch, batch_idx):
204205
return self._val_test_step(
205206
prefix="eval",
206207
batch=batch,
207208
batch_idx=batch_idx,
208209
)
209210

211+
@torch.no_grad()
210212
def test_step(self, batch, batch_idx):
211213
return self._val_test_step(
212214
prefix="test",
213215
batch=batch,
214216
batch_idx=batch_idx,
215217
)
216218

219+
@torch.no_grad()
217220
def predict_step(self, batch: dict[str, Any], batch_idx: int) -> Any:
218221
# Get predictions and ground truth tags
219222
predictions = self(sentences=batch["sentences"], mask=batch["mask"])
@@ -544,7 +547,7 @@ def train(
544547
dropout=parameters.dropout,
545548
learning_rate=parameters.learning_rate,
546549
weight_decay=parameters.weight_decay,
547-
class_weights=torch.tensor(class_weights, dtype=torch.float32),
550+
class_weights=class_weights,
548551
id2label=id2label,
549552
label2id={v: k for k, v in id2label.items()},
550553
)

backend/src/modules/classifier/models/span_class_model_service.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,6 @@ def split_in_chunks(examples: dict):
523523
callbacks.append(JobProgressCallback(job=job))
524524

525525
trainer = pl.Trainer(
526-
accelerator="gpu",
527526
logger=csv_logger,
528527
max_epochs=parameters.epochs,
529528
callbacks=callbacks,

backend/src/systems/job_system/job_handler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from datetime import datetime
22

3-
from common.gpu_utils import find_unused_cuda_device
43
from common.job_type import JobType
4+
from config import conf
55
from modules.doc_processing.doc_processing_pipeline import (
66
handle_job_error,
77
handle_job_finished,
88
)
99
from systems.job_system.job_dto import Job, JobInputBase
10+
from utils.gpu_utils import find_unused_cuda_device, set_cuda_memory_limit
1011

1112

1213
def rq_job_handler(jobtype: JobType, handler, payload: JobInputBase):
@@ -18,6 +19,7 @@ def rq_job_handler(jobtype: JobType, handler, payload: JobInputBase):
1819

1920
cuda_device = find_unused_cuda_device()
2021
with torch.cuda.device(cuda_device):
22+
set_cuda_memory_limit(conf.job.gpu_memory_limit)
2123
output = handler(payload=payload, job=job)
2224
else:
2325
output = handler(payload=payload, job=job)
backend/src/utils/gpu_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ def find_unused_cuda_device() -> str:
3232
return "cuda:0"
3333

3434

35+
def set_cuda_memory_limit(limit_gb: int):
36+
import torch
37+
38+
limit_bytes = limit_gb * 1024 * 1024 * 1024
39+
40+
# set GPU memory limit for job
41+
total = torch.cuda.get_device_properties().total_memory
42+
allowed_fraction = limit_bytes / total
43+
torch.cuda.set_per_process_memory_fraction(allowed_fraction)
44+
45+
3546
def parse_device_string(device_str: str) -> tuple[str, list[int]]:
3647
"""
3748
Parses a device string and returns the appropriate device configuration.

docker/.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ RQ_WORKERS_CPU=8
106106
RQ_WORKERS_API=16
107107
RQ_WORKERS_GPU=1
108108
RQ_DEVICE_IDS=1
109+
RQ_WORKER_GPU_MEM_GB=20
109110
# NUM DB CONNECTIONS
110111
RQ_POSTGRES_POOL_SIZE=1
111112
RQ_POSTGRES_MAX_OVERFLOW=1

0 commit comments

Comments (0)