Skip to content

Commit b04651a

Browse files
author
Benedikt Mersch
authored
Update pytorch-lightning, pytorch, and docker compose (#34)
* Remove deprecated gpus argument
* Formatting
* Fix epoch collectors
* Fix device error
* Use latest PyTorch and CUDA
* Need to use docker-compose, see docker/compose#9681
* This works but it's not ideal
* Better, at least as long as BuildKit does not allow access to the GPU during build. See docker/compose#9681
* Relax these
1 parent 5e45832 commit b04651a

File tree

9 files changed

+32
-32
lines changed

9 files changed

+32
-32
lines changed

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
1+
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
22

33
ENV PROJECT=/mos4d
44
RUN mkdir -p $PROJECT
@@ -21,7 +21,7 @@ RUN rm -rf $PROJECT
2121

2222
RUN pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps \
2323
--install-option="--force_cuda" \
24-
--install-option="--cuda_home=/usr/local/cuda-11.3" \
24+
--install-option="--cuda_home=/usr/local/cuda-11.7" \
2525
--install-option="--blas=openblas"
2626

2727

Makefile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,22 @@ export GROUP_ID:=$(shell id -g)
33

44
build:
55
@echo Build docker image...
6-
@docker-compose build project
6+
@DOCKER_BUILDKIT=0 docker compose build project
77

88
test: check-env
99
@echo NVIDIA and CUDA setup
10-
@docker-compose run project nvidia-smi
10+
@docker compose run project nvidia-smi
1111
@echo Pytorch CUDA setup installed?
12-
@docker-compose run project python3 -c "import torch; print(torch.cuda.is_available())"
12+
@docker compose run project python3 -c "import torch; print(torch.cuda.is_available())"
1313
@echo MinkowskiEngine installed?
14-
@docker-compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"
14+
@docker compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"
1515

1616
run: check-env
17-
@docker-compose run project
17+
@docker compose run project
1818

1919
clean:
2020
@echo Removing docker image...
21-
@docker-compose rm project
21+
@docker compose rm project
2222

2323

2424
check-env:

scripts/predict_confidences.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def main(weights, sequence, dt, poses, transform):
7777
model.freeze()
7878

7979
# Setup trainer
80-
trainer = Trainer(gpus=1, logger=False)
80+
trainer = Trainer(accelerator="gpu", devices=1, logger=False)
8181

8282
# Infer!
8383
trainer.predict(model, data.test_dataloader())

scripts/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
default=None,
4040
)
4141
def main(config, weights, checkpoint):
42-
4342
if checkpoint:
4443
cfg = torch.load(checkpoint)["hyper_parameters"]
4544
else:
@@ -72,7 +71,8 @@ def main(config, weights, checkpoint):
7271

7372
# Setup trainer
7473
trainer = Trainer(
75-
gpus=1,
74+
accelerator="gpu",
75+
devices=1,
7676
logger=tb_logger,
7777
max_epochs=cfg["TRAIN"]["MAX_EPOCH"],
7878
accumulate_grad_batches=cfg["TRAIN"]["ACC_BATCHES"],

setup.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,11 @@
88
description="Receding Moving Object Segmentation in 3D LiDAR Data Using Sparse 4D Convolutions",
99
packages=find_packages(where="src"),
1010
install_requires=[
11-
"Click>=7.0",
12-
"numpy>=1.20.3",
13-
"pytorch_lightning>=1.6.4",
14-
"PyYAML>=6.0",
15-
"tqdm>=4.62.3",
16-
"torch",
17-
"ninja",
11+
"Click",
12+
"numpy",
13+
"pytorch_lightning",
14+
"tensorboard",
15+
"PyYAML",
16+
"tqdm",
1817
],
1918
)

src/mos4d/datasets/datasets.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,6 @@ def __getitem__(self, idx):
212212
past_files = self.filenames[seq][from_idx : to_idx : self.skip]
213213
list_past_point_clouds = [self.read_point_cloud(f) for f in past_files]
214214
for i, pcd in enumerate(list_past_point_clouds):
215-
216215
# Transform to current viewpoint
217216
if self.transform:
218217
from_pose = self.poses[seq][past_indices[i]]

src/mos4d/models/MinkowskiEngine/resnet.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ def __init__(self, in_channels, out_channels, D=3):
5555
self.weight_initialization()
5656

5757
def network_initialization(self, in_channels, out_channels, D):
58-
5958
self.inplanes = self.INIT_DIM
6059
self.conv1 = nn.Sequential(
6160
ME.MinkowskiConvolution(

src/mos4d/models/metrics.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ def __init__(self, n_classes, ignore_index):
1515
self.ignore_index = ignore_index
1616

1717
def compute_confusion_matrix(self, pred_logits: torch.Tensor, gt_labels: torch.Tensor):
18-
1918
# Set ignored classes to -inf to not influence softmax
2019
pred_logits[:, self.ignore_index] = -float("inf")
2120

src/mos4d/models/models.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ def __init__(self, hparams: dict):
4747

4848
self.ClassificationMetrics = ClassificationMetrics(self.n_classes, self.ignore_index)
4949

50+
self.training_step_outputs = []
51+
self.validation_step_outputs = []
52+
5053
def getLoss(self, out: ME.TensorField, past_labels: list):
5154
loss = self.MOSLoss.compute_loss(out, past_labels)
5255
return loss
@@ -70,20 +73,20 @@ def training_step(self, batch: tuple, batch_idx, dataloader_index=0):
7073
self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
7174
)
7275

76+
self.training_step_outputs.append(dict_confusion_matrix)
7377
torch.cuda.empty_cache()
74-
return {"loss": loss, "dict_confusion_matrix": dict_confusion_matrix}
7578

76-
def training_epoch_end(self, training_step_outputs):
77-
list_dict_confusion_matrix = [
78-
output["dict_confusion_matrix"] for output in training_step_outputs
79-
]
79+
return loss
80+
81+
def on_train_epoch_end(self):
8082
for s in range(self.n_past_steps):
8183
agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
82-
for dict_confusion_matrix in list_dict_confusion_matrix:
84+
for dict_confusion_matrix in self.training_step_outputs:
8385
agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
8486
iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
8587
self.log("train_moving_iou_step{}".format(s), iou[2].item())
8688

89+
self.training_step_outputs.clear()
8790
torch.cuda.empty_cache()
8891

8992
def validation_step(self, batch: tuple, batch_idx):
@@ -101,17 +104,18 @@ def validation_step(self, batch: tuple, batch_idx):
101104
self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
102105
)
103106

107+
self.validation_step_outputs.append(dict_confusion_matrix)
104108
torch.cuda.empty_cache()
105-
return dict_confusion_matrix
106109

107-
def validation_epoch_end(self, validation_step_outputs):
110+
def on_validation_epoch_end(self):
108111
for s in range(self.n_past_steps):
109112
agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
110-
for dict_confusion_matrix in validation_step_outputs:
113+
for dict_confusion_matrix in self.validation_step_outputs:
111114
agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
112115
iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
113116
self.log("val_moving_iou_step{}".format(s), iou[2].item())
114117

118+
self.validation_step_outputs.clear()
115119
torch.cuda.empty_cache()
116120

117121
def predict_step(self, batch: tuple, batch_idx: int, dataloader_idx: int = None):
@@ -163,8 +167,8 @@ def get_step_confusion_matrix(self, out, past_labels, step):
163167
t = round(-step * self.dt_prediction, 3)
164168
mask = out.coordinates[:, -1].isclose(torch.tensor(t))
165169
pred_logits = out.features[mask].detach().cpu()
166-
gt_labels = torch.cat(past_labels, dim=0).detach().cpu()
167-
gt_labels = gt_labels[mask][:, 0]
170+
gt_labels = torch.cat(past_labels, dim=0)
171+
gt_labels = gt_labels[mask][:, 0].detach().cpu()
168172
confusion_matrix = self.ClassificationMetrics.compute_confusion_matrix(
169173
pred_logits, gt_labels
170174
)

0 commit comments

Comments
 (0)