Update pytorch-lightning, pytorch, and docker compose (#34)

Benedikt Mersch · web-flow · commit b04651abb4f7 · 2024-01-16T15:59:52.000+01:00
* Remove deprecated `gpus` argument
* Formatting
* Fix epoch collectors
* Fix device error
* Use latest PyTorch and CUDA
* Need to use docker-compose, see docker/compose#9681
* This works but it's not ideal
* Better, at least as long as BuildKit does not allow access to the GPU during build. See docker/compose#9681
* Relax these
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
 
 ENV PROJECT=/mos4d
 RUN mkdir -p $PROJECT
@@ -21,7 +21,7 @@ RUN rm -rf $PROJECT
 
 RUN pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps \
                            --install-option="--force_cuda" \
-                           --install-option="--cuda_home=/usr/local/cuda-11.3" \
+                           --install-option="--cuda_home=/usr/local/cuda-11.7" \
                            --install-option="--blas=openblas"
 
 
diff --git a/Makefile b/Makefile
@@ -3,22 +3,22 @@ export GROUP_ID:=$(shell id -g)
 
 build:
 	@echo Build docker image...
-	@docker-compose build project
+	@DOCKER_BUILDKIT=0 docker compose build project
 
 test: check-env
 	@echo NVIDIA and CUDA setup
-	@docker-compose run project nvidia-smi
+	@docker compose run project nvidia-smi
 	@echo Pytorch CUDA setup installed?
-	@docker-compose run project python3 -c "import torch; print(torch.cuda.is_available())"
+	@docker compose run project python3 -c "import torch; print(torch.cuda.is_available())"
 	@echo MinkowskiEngine installed?
-	@docker-compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"
+	@docker compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"
 
 run: check-env
-	@docker-compose run project
+	@docker compose run project
 
 clean:
 	@echo Removing docker image...
-	@docker-compose rm project
+	@docker compose rm project
 
 
 check-env:
diff --git a/scripts/predict_confidences.py b/scripts/predict_confidences.py
@@ -77,7 +77,7 @@ def main(weights, sequence, dt, poses, transform):
     model.freeze()
 
     # Setup trainer
-    trainer = Trainer(gpus=1, logger=False)
+    trainer = Trainer(accelerator="gpu", devices=1, logger=False)
 
     # Infer!
     trainer.predict(model, data.test_dataloader())
diff --git a/scripts/train.py b/scripts/train.py
@@ -39,7 +39,6 @@
     default=None,
 )
 def main(config, weights, checkpoint):
-
     if checkpoint:
         cfg = torch.load(checkpoint)["hyper_parameters"]
     else:
@@ -72,7 +71,8 @@ def main(config, weights, checkpoint):
 
     # Setup trainer
     trainer = Trainer(
-        gpus=1,
+        accelerator="gpu",
+        devices=1,
         logger=tb_logger,
         max_epochs=cfg["TRAIN"]["MAX_EPOCH"],
         accumulate_grad_batches=cfg["TRAIN"]["ACC_BATCHES"],
diff --git a/setup.py b/setup.py
@@ -8,12 +8,11 @@
     description="Receding Moving Object Segmentation in 3D LiDAR Data Using Sparse 4D Convolutions",
     packages=find_packages(where="src"),
     install_requires=[
-        "Click>=7.0",
-        "numpy>=1.20.3",
-        "pytorch_lightning>=1.6.4",
-        "PyYAML>=6.0",
-        "tqdm>=4.62.3",
-        "torch",
-        "ninja",
+        "Click",
+        "numpy",
+        "pytorch_lightning",
+        "tensorboard",
+        "PyYAML",
+        "tqdm",
     ],
 )
diff --git a/src/mos4d/datasets/datasets.py b/src/mos4d/datasets/datasets.py
@@ -212,7 +212,6 @@ def __getitem__(self, idx):
         past_files = self.filenames[seq][from_idx : to_idx : self.skip]
         list_past_point_clouds = [self.read_point_cloud(f) for f in past_files]
         for i, pcd in enumerate(list_past_point_clouds):
-
             # Transform to current viewpoint
             if self.transform:
                 from_pose = self.poses[seq][past_indices[i]]
diff --git a/src/mos4d/models/MinkowskiEngine/resnet.py b/src/mos4d/models/MinkowskiEngine/resnet.py
@@ -55,7 +55,6 @@ def __init__(self, in_channels, out_channels, D=3):
         self.weight_initialization()
 
     def network_initialization(self, in_channels, out_channels, D):
-
         self.inplanes = self.INIT_DIM
         self.conv1 = nn.Sequential(
             ME.MinkowskiConvolution(
diff --git a/src/mos4d/models/metrics.py b/src/mos4d/models/metrics.py
@@ -15,7 +15,6 @@ def __init__(self, n_classes, ignore_index):
         self.ignore_index = ignore_index
 
     def compute_confusion_matrix(self, pred_logits: torch.Tensor, gt_labels: torch.Tensor):
-
         # Set ignored classes to -inf to not influence softmax
         pred_logits[:, self.ignore_index] = -float("inf")
 
diff --git a/src/mos4d/models/models.py b/src/mos4d/models/models.py
@@ -47,6 +47,9 @@ def __init__(self, hparams: dict):
 
         self.ClassificationMetrics = ClassificationMetrics(self.n_classes, self.ignore_index)
 
+        self.training_step_outputs = []
+        self.validation_step_outputs = []
+
     def getLoss(self, out: ME.TensorField, past_labels: list):
         loss = self.MOSLoss.compute_loss(out, past_labels)
         return loss
@@ -70,20 +73,20 @@ def training_step(self, batch: tuple, batch_idx, dataloader_index=0):
                 self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
             )
 
+        self.training_step_outputs.append(dict_confusion_matrix)
         torch.cuda.empty_cache()
-        return {"loss": loss, "dict_confusion_matrix": dict_confusion_matrix}
 
-    def training_epoch_end(self, training_step_outputs):
-        list_dict_confusion_matrix = [
-            output["dict_confusion_matrix"] for output in training_step_outputs
-        ]
+        return loss
+
+    def on_train_epoch_end(self):
         for s in range(self.n_past_steps):
             agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
-            for dict_confusion_matrix in list_dict_confusion_matrix:
+            for dict_confusion_matrix in self.training_step_outputs:
                 agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
             iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
             self.log("train_moving_iou_step{}".format(s), iou[2].item())
 
+        self.training_step_outputs.clear()
         torch.cuda.empty_cache()
 
     def validation_step(self, batch: tuple, batch_idx):
@@ -101,17 +104,18 @@ def validation_step(self, batch: tuple, batch_idx):
                 self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
             )
 
+        self.validation_step_outputs.append(dict_confusion_matrix)
         torch.cuda.empty_cache()
-        return dict_confusion_matrix
 
-    def validation_epoch_end(self, validation_step_outputs):
+    def on_validation_epoch_end(self):
         for s in range(self.n_past_steps):
             agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
-            for dict_confusion_matrix in validation_step_outputs:
+            for dict_confusion_matrix in self.validation_step_outputs:
                 agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
             iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
             self.log("val_moving_iou_step{}".format(s), iou[2].item())
 
+        self.validation_step_outputs.clear()
         torch.cuda.empty_cache()
 
     def predict_step(self, batch: tuple, batch_idx: int, dataloader_idx: int = None):
@@ -163,8 +167,8 @@ def get_step_confusion_matrix(self, out, past_labels, step):
         t = round(-step * self.dt_prediction, 3)
         mask = out.coordinates[:, -1].isclose(torch.tensor(t))
         pred_logits = out.features[mask].detach().cpu()
-        gt_labels = torch.cat(past_labels, dim=0).detach().cpu()
-        gt_labels = gt_labels[mask][:, 0]
+        gt_labels = torch.cat(past_labels, dim=0)
+        gt_labels = gt_labels[mask][:, 0].detach().cpu()
         confusion_matrix = self.ClassificationMetrics.compute_confusion_matrix(
             pred_logits, gt_labels
         )