I am using an A100 GPU to run the code, and it seems to be incompatible with CUDA 10.2. It reports the following error:
INFO 2024-09-09 20:39:10,546 state_update_hooks.py: 113: Starting phase 0 [train]
--- Logging error ---
Traceback (most recent call last):
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 1085, in emit
msg = self.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 929, in format
return fmt.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 668, in format
record.message = record.getMessage()
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 373, in getMessage
msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 162, in launch_distributed
logging.error("Wrapping up, caught exception: ", e)
Message: 'Wrapping up, caught exception: '
Arguments: (RuntimeError('CUDA error: no kernel image is available for execution on the device'),)
--- Logging error ---
Traceback (most recent call last):
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 1085, in emit
msg = self.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 929, in format
return fmt.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 668, in format
record.message = record.getMessage()
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 373, in getMessage
msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 162, in launch_distributed
logging.error("Wrapping up, caught exception: ", e)
Message: 'Wrapping up, caught exception: '
Arguments: (RuntimeError('CUDA error: no kernel image is available for execution on the device'),)
Traceback (most recent call last):
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 164, in launch_distributed
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
Can CUDA version 11 or higher be used with this code?
I am using an A100 GPU to run the code, and it seems to be incompatible with CUDA 10.2. It reports the following error:
INFO 2024-09-09 20:39:10,546 state_update_hooks.py: 113: Starting phase 0 [train]
--- Logging error ---
Traceback (most recent call last):
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 1085, in emit
msg = self.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 929, in format
return fmt.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 668, in format
record.message = record.getMessage()
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 373, in getMessage
msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 162, in launch_distributed
logging.error("Wrapping up, caught exception: ", e)
Message: 'Wrapping up, caught exception: '
Arguments: (RuntimeError('CUDA error: no kernel image is available for execution on the device'),)
--- Logging error ---
Traceback (most recent call last):
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 1085, in emit
msg = self.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 929, in format
return fmt.format(record)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 668, in format
record.message = record.getMessage()
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/logging/__init__.py", line 373, in getMessage
msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 162, in launch_distributed
logging.error("Wrapping up, caught exception: ", e)
Message: 'Wrapping up, caught exception: '
Arguments: (RuntimeError('CUDA error: no kernel image is available for execution on the device'),)
Traceback (most recent call last):
File "main.py", line 97, in
hydra_main(overrides=overrides, mode=training_mode)
File "main.py", line 59, in hydra_main
launch_distributed(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 164, in launch_distributed
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 150, in launch_distributed
_distributed_worker(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/utils/distributed_launcher.py", line 192, in _distributed_worker
run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/engine_registry.py", line 86, in run_engine
engine.run_engine(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 39, in run_engine
train_main(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/engines/train.py", line 130, in train_main
trainer.train()
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 201, in train
raise e
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/trainer_main.py", line 193, in train
task = train_step_fn(task)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/trainer/train_steps/custom_train_step_surgery.py", line 189, in custom_train_step_surgery
model_output = task.model(sample["input"])
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 97, in __call__
return self.forward(*args, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/ClassyVision/classy_vision/models/classy_model.py", line 111, in forward
out = self.classy_model(*args, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 179, in forward
return self.single_input_forward(batch, self._output_feature_names, self.heads)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/base_ssl_model.py", line 127, in single_input_forward
feats = self.trunk(batch, feature_names)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/trunks/resnext.py", line 184, in forward
out = get_trunk_forward_outputs(
File "/home/NVME-2/SelfSupSurg/ext_libs/vissl/vissl/models/model_helpers.py", line 463, in get_trunk_forward_outputs
feat = feature_block(feat)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm.py", line 85, in forward
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu)
File "/home/chenghua/anaconda3/envs/ssl/lib/python3.8/site-packages/apex/parallel/optimized_sync_batchnorm_kernel.py", line 36, in forward
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
RuntimeError: CUDA error: no kernel image is available for execution on the device
Can CUDA version 11 or higher be used with this code?