I was running the global forecast training on 4 GPUs and 48 CPUs.
Training works when the number of dataloader workers (--data.num_workers) is set to 1.
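For context, the launch command looks roughly like this (the config path is a placeholder, the device count matches my 4-GPU setup, and 12 is one of the worker counts I tried):

    python src/climax/global_forecast/train.py \
        --config <path/to/config>.yaml \
        --trainer.devices=4 \
        --data.num_workers=12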
However, training fails whenever I increase the number of workers for more efficient data loading.
I tried 48, 24, 12, and 8, and every run failed with the following error:
Traceback (most recent call last):
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 90, in
launch
return function(*args, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 621, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1058, in _run
results = self._run_stage()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1137, in _run_stage
self._run_train()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1150, in _run_train
self._run_sanity_check()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1224, in _run_sanity_check
self._call_callback_hooks("on_sanity_check_end")
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1340, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/callbacks/progress/rich_progress.py", line 358, in
on_sanity_check_end
assert self.val_sanity_progress_bar_id is not None
AssertionError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/g/data/wb00/admin/staging/ClimaX/src/climax/global_forecast/train.py", line 41, in <module>
main()
File "/g/data/wb00/admin/staging/ClimaX/src/climax/global_forecast/train.py", line 34, in main
cli.trainer.fit(cli.model, datamodule=cli.datamodule)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in fit
call._call_and_handle_interrupt(
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 59, in _call_and_handle_interrupt
trainer.strategy.reconciliate_processes(traceback.format_exc())
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 461, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 90, in
launch
return function(*args, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 621, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1058, in _run
results = self._run_stage()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1137, in _run_stage
self._run_train()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1150, in _run_train
self._run_sanity_check()
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1224, in _run_sanity_check
self._call_callback_hooks("on_sanity_check_end")
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1340, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/dir/proj/terry/env/climax/lib/python3.8/site-packages/pytorch_lightning/callbacks/progress/rich_progress.py", line 358, in
on_sanity_check_end
assert self.val_sanity_progress_bar_id is not None
AssertionError
At the moment, training only works with a single worker; any higher value produces the error above.
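Reading the trace, the primary failure seems to be the assert self.val_sanity_progress_bar_id is not None in RichProgressBar.on_sanity_check_end; the DeadlockDetectedException from the DDP strategy looks like a secondary effect of rank 0 crashing in that hook. Would skipping the validation sanity check be a reasonable way to test this? A sketch, using the standard Lightning Trainer argument num_sanity_val_steps (config path again a placeholder):

    # Hypothetical workaround: skip the sanity check so that
    # on_sanity_check_end never fires
    python src/climax/global_forecast/train.py \
        --config <path/to/config>.yaml \
        --trainer.num_sanity_val_steps=0 \
        --data.num_workers=12

Alternatively, swapping the RichProgressBar callback for the default TQDM progress bar in the config might show whether the progress bar, rather than the dataloader workers, is the real culprit.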
Any suggestions on how to solve this?