Thanks for your great work. When I try to train the model on eight RTX 3090 GPUs with the following command:
./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp --ref 32
I hit the following error:
Starting training from iteration 0
Exception during training:
[06/17 03:38:05 d2.engine.hooks]: Total training time: 0:00:25 (0:00:00 on hooks)
[06/17 03:38:05 d2.utils.events]: odise_label_coco_50e_bs16x8/default iter: 0/368752 lr: N/A max_mem: 19297M
Traceback (most recent call last):
File "/home/zoloz/8T-1/zitong/code/ODISE/./tools/train_net.py", line 392, in <module>
launch(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/launch.py", line 67, in launch
mp.spawn(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/launch.py", line 126, in _distributed_worker
main_func(*args)
File "/home/zoloz/8T-1/zitong/code/ODISE/tools/train_net.py", line 363, in main
do_train(args, cfg)
File "/home/zoloz/8T-1/zitong/code/ODISE/tools/train_net.py", line 309, in do_train
trainer.train(start_iter, cfg.train.max_iter)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/train_loop.py", line 149, in train
self.run_step()
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 297, in run_step
grad_norm = self.grad_scaler(
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 207, in __call__
self._scaler.scale(loss).backward(create_graph=create_graph)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/ldm/modules/diffusionmodules/util.py", line 142, in backward
input_grads = torch.autograd.grad(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 300, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 414, in wrapper
outputs = fn(ctx, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 111, in backward
grads = _memory_efficient_attention_backward(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 382, in _memory_efficient_attention_backward
grads = op.apply(ctx, inp, grad)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/cutlass.py", line 184, in apply
(grad_q, grad_k, grad_v,) = cls.OPERATOR(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/_ops.py", line 442, in __call__
return self._op(*args, **kwargs or {})
RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
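A minimal standalone check of the failing op might help isolate this. Here is a sketch that runs xformers' memory-efficient attention forward + backward on a single GPU in fp16 (to mimic --amp); the shapes below are arbitrary placeholders, not the ones ODISE actually feeds the kernel:

```python
# Standalone check of the op that crashes above: xformers memory-efficient
# attention forward + backward on one GPU, in fp16 to mimic --amp training.
# Shapes are placeholders, not ODISE's actual ones.
import torch
import xformers.ops as xops

dev = torch.device("cuda:0")
print(torch.__version__, torch.version.cuda, torch.cuda.get_device_name(dev))
print("compute capability:", torch.cuda.get_device_capability(dev))  # 3090 -> (8, 6)

# (batch, seq_len, num_heads, head_dim)
q = torch.randn(2, 1024, 8, 64, device=dev, dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

# Default op dispatch; the traceback above shows the cutlass backward being picked.
out = xops.memory_efficient_attention(q, k, v)
out.sum().backward()  # the crash above happens inside this backward pass
print("backward OK; grad norm =", q.grad.norm().item())
```

If this snippet reproduces the same "CUDA error: invalid argument", the problem presumably sits in the installed xformers build for sm_86 (e.g. a wheel compiled without kernels for this architecture) rather than in ODISE itself; if it passes, the failure may depend on the exact shapes/dtypes ODISE uses. Rerunning training with CUDA_LAUNCH_BLOCKING=1 prepended to the command above should also give a synchronous, more trustworthy stack trace.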