For the past week or two, I've been training in Google Colab using the experimental branch, and it's gone well. I do have to make a few changes to the code for it to function in Colab.
However, I tried to do some more training today, and I've run into an error that I can't figure out. It happened when I ran the training script with my own dataset, using the last checkpoint I had.
I'm training at a 44100 sampling rate, with hop size, window size, etc. adjusted accordingly. I had to adjust the n_speakers and decoder_rnn_dim, and turn off the second decoder, so that my old checkpoints would be compatible.
train(args, args.rank, args.group_name, hparams)
File "train.py", line 707, in train
y_pred = force(model, valid_kwargs=model_args, **{**y, "teacher_force_till": teacher_force_till, "p_teacher_forcing": p_teacher_forcing, "drop_frame_rate": drop_frame_rate})
File "/content/cookietts/CookieTTS/utils/_utils_.py", line 35, in force
return func(*args, **{k:v for k,v in kwargs.items() if k in valid_kwargs})
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/apex/amp/_initialize.py", line 197, in new_fwd
**applier(kwargs, input_caster))
File "/content/cookietts/CookieTTS/_2_ttm/tacotron2_tm/model.py", line 1012, in forward
return_hidden_state=return_hidden_state)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/content/cookietts/CookieTTS/_2_ttm/tacotron2_tm/model.py", line 835, in forward
mel_output, gate_output, attention_weights, decoder_hidden_attention_context = self.decode(decoder_input, memory_lengths)
File "/content/cookietts/CookieTTS/_2_ttm/tacotron2_tm/model.py", line 746, in decode
decoderrnn_state = self.decoder_rnn(decoder_input, (decoder_hidden, decoder_cell))# lstmcell 12.789ms
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/content/cookietts/CookieTTS/utils/model/layers.py", line 386, in forward
self.bias_ih, self.bias_hh,
RuntimeError: default_program(57): error: identifier "aten_sigmoid_flat__1" is undefined
default_program(58): error: no operator "=" matches these operands
operand types are: half = float
default_program(64): error: identifier "aten_mul_flat__1" is undefined
default_program(65): error: no operator "=" matches these operands
operand types are: half = float
4 errors detected in the compilation of "default_program".
nvrtc compilation failed:
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)
template<typename T>
__device__ T maximum(T a, T b) {
return isnan(a) ? a : (a > b ? a : b);
}
template<typename T>
__device__ T minimum(T a, T b) {
return isnan(a) ? a : (a < b ? a : b);
}
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
#if defined(__cplusplus)
struct __align__(2) __half {
__host__ __device__ __half() { }
protected:
unsigned short __x;
};
/* All intrinsic functions are only available to nvcc compilers */
#if defined(__CUDACC__)
/* Definitions of intrinsics */
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
return val;
}
#endif /* defined(__CUDACC__) */
#endif /* defined(__cplusplus) */
#undef __HALF_TO_US
#undef __HALF_TO_CUS
typedef __half half;
extern "C" __global__
void func_3(half* t0, half* t1, half* aten_mul_flat, half* aten_sigmoid_flat, half* aten_mul_flat_1, half* aten_tanh_flat, half* aten_sigmoid_flat_1, half* prim_constantchunk_flat) {
{
float v = __half2float(t1[((512 * blockIdx.x + threadIdx.x) % 1280 + 4 * (((512 * blockIdx.x + threadIdx.x) / 1280) * 1280)) + 3840]);
prim_constantchunk_flat[512 * blockIdx.x + threadIdx.x] = __float2half(v);
float t1_ = __half2float(t1[((512 * blockIdx.x + threadIdx.x) % 1280 + 4 * (((512 * blockIdx.x + threadIdx.x) / 1280) * 1280)) + 1280]);
float aten_sigmoid_flat_ = __half2float(aten_sigmoid_flat[512 * blockIdx.x + threadIdx.x]);
aten_sigmoid_flat__1 = __float2half(1.f / (1.f + (expf(0.f - t1_))));
aten_sigmoid_flat[512 * blockIdx.x + threadIdx.x] = aten_sigmoid_flat_;
float t1__1 = __half2float(t1[((512 * blockIdx.x + threadIdx.x) % 1280 + 4 * (((512 * blockIdx.x + threadIdx.x) / 1280) * 1280)) + 2560]);
aten_tanh_flat[512 * blockIdx.x + threadIdx.x] = __float2half(tanhf(t1__1));
float t1__2 = __half2float(t1[(512 * blockIdx.x + threadIdx.x) % 1280 + 4 * (((512 * blockIdx.x + threadIdx.x) / 1280) * 1280)]);
aten_sigmoid_flat_1[512 * blockIdx.x + threadIdx.x] = __float2half(1.f / (1.f + (expf(0.f - t1__2))));
float aten_mul_flat_ = __half2float(aten_mul_flat[512 * blockIdx.x + threadIdx.x]);
aten_mul_flat__1 = __float2half((1.f / (1.f + (expf(0.f - t1_)))) * __half2float(t0[512 * blockIdx.x + threadIdx.x]));
aten_mul_flat[512 * blockIdx.x + threadIdx.x] = aten_mul_flat_;
aten_mul_flat_1[512 * blockIdx.x + threadIdx.x] = __float2half((1.f / (1.f + (expf(0.f - t1__2)))) * (tanhf(t1__1)));
}
}
Epoch:: 46% 456/1000 [00:12<00:14, 37.47epoch/s]
Iter: : 0% 0/67 [00:11<?, ?iter/s]
/content/cookietts/CookieTTS/utils/torchmoji/model_def.py:193: UserWarning: This overload of nonzero is deprecated:
nonzero()
Consider using one of the following signatures instead:
nonzero(*, bool as_tuple) (Triggered internally at /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
input_lengths = torch.LongTensor([torch.max(input_seqs[i, :].data.nonzero()) + 1 for i in range(input_seqs.size()[0])])