Hi, my training run also fails right at startup. Could you tell me how to fix this bug?
Here is the output:
"distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
micl-libyws6:2583202:2583202 [2] NCCL INFO Bootstrap : Using [0]eth0:10.96.80.81<0> [1]enxb03af2b6059f:169.254.3.1<0>
micl-libyws6:2583202:2583202 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
micl-libyws6:2583202:2583202 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
micl-libyws6:2583202:2583202 [2] NCCL INFO NET/Socket : Using [0]eth0:10.96.80.81<0> [1]enxb03af2b6059f:169.254.3.1<0>
micl-libyws6:2583202:2583202 [2] NCCL INFO Using network Socket
NCCL version 2.7.8+cuda10.2
micl-libyws6:2583381:2583381 [3] NCCL INFO Bootstrap : Using [0]eth0:10.96.80.81<0> [1]enxb03af2b6059f:169.254.3.1<0>
micl-libyws6:2583381:2583381 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
micl-libyws6:2583381:2583381 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
micl-libyws6:2583381:2583381 [3] NCCL INFO NET/Socket : Using [0]eth0:10.96.80.81<0> [1]enxb03af2b6059f:169.254.3.1<0>
micl-libyws6:2583381:2583381 [3] NCCL INFO Using network Socket
micl-libyws6:2583202:2587764 [2] NCCL INFO Channel 00/02 : 0 1
micl-libyws6:2583381:2587765 [3] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 8/8/64
micl-libyws6:2583202:2587764 [2] NCCL INFO Channel 01/02 : 0 1
micl-libyws6:2583381:2587765 [3] NCCL INFO Trees [0] -1/-1/-1->1->0|0->1->-1/-1/-1 [1] -1/-1/-1->1->0|0->1->-1/-1/-1
micl-libyws6:2583202:2587764 [2] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 8/8/64
micl-libyws6:2583202:2587764 [2] NCCL INFO Trees [0] 1/-1/-1->0->-1|-1->0->1/-1/-1 [1] 1/-1/-1->0->-1|-1->0->1/-1/-1
micl-libyws6:2583381:2587765 [3] NCCL INFO Channel 00 : 1[43000] -> 0[41000] via P2P/IPC
micl-libyws6:2583202:2587764 [2] NCCL INFO Channel 00 : 0[41000] -> 1[43000] via P2P/IPC
micl-libyws6:2583381:2587765 [3] NCCL INFO Channel 01 : 1[43000] -> 0[41000] via P2P/IPC
micl-libyws6:2583202:2587764 [2] NCCL INFO Channel 01 : 0[41000] -> 1[43000] via P2P/IPC
micl-libyws6:2583381:2587765 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
micl-libyws6:2583381:2587765 [3] NCCL INFO comm 0x7f96640010d0 rank 1 nranks 2 cudaDev 3 busId 43000 - Init COMPLETE
micl-libyws6:2583202:2587764 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
micl-libyws6:2583202:2587764 [2] NCCL INFO comm 0x7fe9a00010d0 rank 0 nranks 2 cudaDev 2 busId 41000 - Init COMPLETE
micl-libyws6:2583202:2583202 [2] NCCL INFO Launch mode Parallel
micl-libyws6:2583381:2583381 [3] enqueue.cc:215 NCCL WARN Cuda failure 'invalid device function'
micl-libyws6:2583381:2583381 [3] NCCL INFO group.cc:282 -> 1
micl-libyws6:2583202:2583202 [2] enqueue.cc:215 NCCL WARN Cuda failure 'invalid device function'
micl-libyws6:2583202:2583202 [2] NCCL INFO group.cc:282 -> 1
Error executing job with overrides: []
Traceback (most recent call last):
File "/home/houyi/projects/ARLDM/main.py", line 489, in
main()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 458, in
lambda: hydra.run(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/home/houyi/projects/ARLDM/main.py", line 482, in main
train(args)
File "/home/houyi/projects/ARLDM/main.py", line 437, in train
trainer.fit(model, dataloader, ckpt_path=args.train_model_file)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in fit
self._call_and_handle_interrupt(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 723, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1172, in _run
self.__setup_profiler()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1797, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 2249, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 319, in broadcast
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1681, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1039, in broadcast
Error executing job with overrides: []
Traceback (most recent call last):
File "main.py", line 489, in
work = default_pg.broadcast([tensor], opts)
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1627336343171/work/torch/lib/c10d/ProcessGroupNCCL.cpp:33, unhandled cuda error, NCCL version 2.7.8
ncclUnhandledCudaError: Call to CUDA function failed.
main()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/utils.py", line 458, in
lambda: hydra.run(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "main.py", line 482, in main
train(args)
File "main.py", line 437, in train
trainer.fit(model, dataloader, ckpt_path=args.train_model_file)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in fit
self._call_and_handle_interrupt(
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1172, in _run
self.__setup_profiler()
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1797, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 2249, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 319, in broadcast
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1681, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/home/houyi/anaconda3/envs/arldm2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1039, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1627336343171/work/torch/lib/c10d/ProcessGroupNCCL.cpp:33, unhandled cuda error, NCCL version 2.7.8
ncclUnhandledCudaError: Call to CUDA function failed."