Thanks for sharing your very effective training code and pretrained model.
In order to reproduce the results of 'wespeaker', we are experimenting with ResNet152/221/293 while varying the batch size and chunk size, but an error occurred.
The training GPU environment is A100 (GPU: 0~7, 80GB memory).
However, when a batch size of 64 and num_frms of 600 are applied in resnet_lm.yaml for large-margin finetuning, a CUDA out-of-memory error occurs, as shown in the log below.
Could you please comment on what caused the error?
[ INFO : 2023-05-09 08:51:31,879 ] - +----------+----------+----------+----------+----------+----------+
[ INFO : 2023-05-09 08:51:31,879 ] - | Epoch| Batch| Lr| Margin| Loss| Acc|
[ INFO : 2023-05-09 08:51:31,879 ] - +----------+----------+----------+----------+----------+----------+
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 4; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 5; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 2; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 3; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 0; 79.17 GiB total capacity; 76.47 GiB already allocated; 135.81 MiB free; 76.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 7; 79.17 GiB total capacity; 76.47 GiB already allocated; 135.81 MiB free; 76.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 6; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 238, in
fire.Fire(train)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data/work_speaker_recognition/wespeaker/examples/voxceleb/v2/wespeaker/bin/train.py", line 210, in train
run_epoch(train_dataloader,
File "/data/work_speaker_recognition/wespeaker/wespeaker/utils/executor.py", line 61, in run_epoch
outputs = model(features) # (embed_a,embed_b) in most cases
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
return module_to_run(*inputs, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 179, in forward
out = self.layer4(out)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/work_speaker_recognition/wespeaker/wespeaker/models/resnet.py", line 106, in forward
out += self.shortcut(x)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 168, in forward
return F.batch_norm(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/nn/functional.py", line 2438, in batch_norm
return torch.batch_norm(
RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 1; 79.17 GiB total capacity; 76.47 GiB already allocated; 113.81 MiB free; 76.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 216843) of binary: /home/azureuser/miniconda3/envs/wespeaker/bin/python
Traceback (most recent call last):
File "/home/azureuser/miniconda3/envs/wespeaker/bin/torchrun", line 33, in
sys.exit(load_entry_point('torch==1.12.1', 'console_scripts', 'torchrun')())
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 345, in wrapper
return f(*args, **kwargs)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/distributed/run.py", line 761, in main
run(args)
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/distributed/run.py", line 752, in run
elastic_launch(
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/azureuser/miniconda3/envs/wespeaker/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: