I modified the scripts a little bit by setting the use_multiprocsessing as false, disabling the GPU, setting n_epochs as 1 and setting chkpt_cback to save the checkpoints only at the last epoch. I keep getting this warning:
Could someone explain what is going wrong here and how should I modify the script?
import os # modified
import os.path as op
import time
from keras_tqdm import TQDMCallback
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
from fastmri_recon.data.sequences.fastmri_sequences import Masked2DSequence, KIKISequence
from fastmri_recon.models.functional_models.kiki_sep import kiki_sep_net
from fastmri_recon.models.utils.data_consistency import MultiplyScalar
from fastmri_recon.models.utils.non_linearities import lrelu
train_path = 'C:/Users/yy263/Desktop/knee_singlecoil_train/singlecoil_train_ori/singlecoil_train/' # modified
val_path = 'C:/Users/yy263/Desktop/knee_singlecoil_train/singlecoil_val_ori/' # modified
test_path = 'C:/Users/yy263/Desktop/knee_singlecoil_train/singlecoil_test/' # modified
cuda_visible_devices = '-1' # modified
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices) # modified
n_samples_train = 34742
n_samples_val = 7135
n_volumes_train = 973
n_volumes_val = 199
AF = 4
train_gen_last = Masked2DSequence(train_path, af=AF, inner_slices=8, rand=True, scale_factor=1e6)
val_gen_last = Masked2DSequence(val_path, af=AF, scale_factor=1e6)
train_gen_i = KIKISequence(train_path, af=AF, inner_slices=8, rand=True, scale_factor=1e6, space='I')
val_gen_i = KIKISequence(val_path, af=AF, scale_factor=1e6, space='I')
train_gen_k = KIKISequence(train_path, af=AF, inner_slices=8, rand=True, scale_factor=1e6, space='K')
val_gen_k = KIKISequence(val_path, af=AF, scale_factor=1e6, space='K')
run_params = {
'n_convs': 25,
'n_filters': 48,
'noiseless': True,
'lr': 1e-3,
'activation': lrelu,
}
multiply_scalar = MultiplyScalar()
n_epochs = 1 # modified
def learning_rate_from_epoch(epoch):
return 10**(-(epoch // (n_epochs/3)) - 3)
def train_model(model, space='K', n=1):
print(model.summary(line_length=150))
run_id = f'kikinet_sep_{space}{n}_af{AF}_{int(time.time())}'
chkpt_path = f'checkpoints/{run_id}' + '-{epoch:02d}.hdf5'
print(run_id)
chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs) # modified
log_dir = op.join('logs', run_id)
tboard_cback = TensorBoard(
profile_batch=0,
log_dir=log_dir,
histogram_freq=0,
write_graph=True,
write_images=False,
)
lrate_cback = LearningRateScheduler(learning_rate_from_epoch)
tqdm_cb = TQDMCallback(metric_format="{name}: {value:e}")
tqdm_cb.on_train_batch_begin = tqdm_cb.on_batch_begin
tqdm_cb.on_train_batch_end = tqdm_cb.on_batch_end
if space == 'K':
train_gen = train_gen_k
val_gen = val_gen_k
elif space == 'I':
if n == 2:
train_gen = train_gen_last
val_gen = val_gen_last
elif n == 1:
train_gen = train_gen_i
val_gen = val_gen_i
model.fit_generator(
train_gen,
steps_per_epoch=n_volumes_train,
epochs=n_epochs,
validation_data=val_gen,
validation_steps=1,
verbose=0,
callbacks=[tqdm_cb, tboard_cback, chkpt_cback, lrate_cback,],
# max_queue_size=35,
use_multiprocessing=False, # modified
workers=35,
shuffle=True,
)
return model
# first K net training
model = kiki_sep_net(None, multiply_scalar, to_add='K', last=False, **run_params)
train_model(model, space='K', n=1)
model = kiki_sep_net(model, multiply_scalar, to_add='I', last=False, **run_params)
train_model(model, space='I', n=1)
model = kiki_sep_net(model, multiply_scalar, to_add='K', last=False, **run_params)
train_model(model, space='K', n=2)
model = kiki_sep_net(model, multiply_scalar, to_add='I', last=True, **run_params)
train_model(model, space='I', n=2)
2020-11-21 14:05:43.618232: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-11-21 14:05:45.301825: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library nvcuda.dll
2020-11-21 14:05:45.482548: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-21 14:05:45.490774: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: RON-MJ09HCC4
2020-11-21 14:05:45.494004: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: RON-MJ09HCC4
2020-11-21 14:05:45.496516: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2020-11-21 14:05:45.520179: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x198d47b6560 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-21 14:05:45.524038: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
Model: "model"
______________________________________________________________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
======================================================================================================================================================
kspace_input (InputLayer) [(None, 640, None, 1)] 0
______________________________________________________________________________________________________________________________________________________
lambda (Lambda) (None, 640, None, 1) 0 kspace_input[0][0]
______________________________________________________________________________________________________________________________________________________
lambda_1 (Lambda) (None, 640, None, 1) 0 kspace_input[0][0]
______________________________________________________________________________________________________________________________________________________
concatenate (Concatenate) (None, 640, None, 2) 0 lambda[0][0]
lambda_1[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d (Conv2D) (None, 640, None, 48) 912 concatenate[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_1 (Conv2D) (None, 640, None, 48) 20784 conv2d[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_2 (Conv2D) (None, 640, None, 48) 20784 conv2d_1[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_3 (Conv2D) (None, 640, None, 48) 20784 conv2d_2[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_4 (Conv2D) (None, 640, None, 48) 20784 conv2d_3[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_5 (Conv2D) (None, 640, None, 48) 20784 conv2d_4[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_6 (Conv2D) (None, 640, None, 48) 20784 conv2d_5[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_7 (Conv2D) (None, 640, None, 48) 20784 conv2d_6[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_8 (Conv2D) (None, 640, None, 48) 20784 conv2d_7[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_9 (Conv2D) (None, 640, None, 48) 20784 conv2d_8[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_10 (Conv2D) (None, 640, None, 48) 20784 conv2d_9[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_11 (Conv2D) (None, 640, None, 48) 20784 conv2d_10[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_12 (Conv2D) (None, 640, None, 48) 20784 conv2d_11[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_13 (Conv2D) (None, 640, None, 48) 20784 conv2d_12[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_14 (Conv2D) (None, 640, None, 48) 20784 conv2d_13[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_15 (Conv2D) (None, 640, None, 48) 20784 conv2d_14[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_16 (Conv2D) (None, 640, None, 48) 20784 conv2d_15[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_17 (Conv2D) (None, 640, None, 48) 20784 conv2d_16[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_18 (Conv2D) (None, 640, None, 48) 20784 conv2d_17[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_19 (Conv2D) (None, 640, None, 48) 20784 conv2d_18[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_20 (Conv2D) (None, 640, None, 48) 20784 conv2d_19[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_21 (Conv2D) (None, 640, None, 48) 20784 conv2d_20[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_22 (Conv2D) (None, 640, None, 48) 20784 conv2d_21[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_23 (Conv2D) (None, 640, None, 48) 20784 conv2d_22[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_24 (Conv2D) (None, 640, None, 48) 20784 conv2d_23[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_25 (Conv2D) (None, 640, None, 2) 98 conv2d_24[0][0]
______________________________________________________________________________________________________________________________________________________
mask_input (InputLayer) [(None, 640, None)] 0
______________________________________________________________________________________________________________________________________________________
lambda_2 (Lambda) (None, 640, None, 1) 0 conv2d_25[0][0]
======================================================================================================================================================
Total params: 499,826
Trainable params: 499,826
Non-trainable params: 0
______________________________________________________________________________________________________________________________________________________
None
kikinet_sep_K1_af4_1605985545
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
WARNING:tensorflow:From kikinet_sep_approach_af4_ori.py:94: Model.fit_generator (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.
Instructions for updating:
Please use Model.fit, which supports generators.
Training: 0%| | 0/50 [00:00<?, ?it/s]Traceback (most recent call last): | 0/973 [00:00<?, ?it/s]
File "kikinet_sep_approach_af4_ori.py", line 100, in <module>
train_model(model, space='K', n=1)
File "kikinet_sep_approach_af4_ori.py", line 94, in train_model
shuffle=False,
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\util\deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1479, in fit_generator
initial_epoch=initial_epoch)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 855, in fit
callbacks.on_train_batch_end(step, logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\callbacks.py", line 390, in on_train_batch_end
self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\callbacks.py", line 298, in _call_batch_hook
batch_hook(batch, logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\keras_tqdm\tqdm_callback.py", line 117, in on_batch_end
self.append_logs(logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\keras_tqdm\tqdm_callback.py", line 136, in append_logs
metrics = self.params['metrics']
KeyError: 'metrics'
2020-11-21 14:05:59.152617: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Failed precondition: Python interpreter state is not initialized. The process may be terminated.
[[{{node PyFunc}}]]
Training: 0%| | 0/50 [00:13<?, ?it/s]
Epoch: 0: 0%| | 0/973 [00:13<?, ?it/s]
(fastMRI) C:\Users\yy263\Desktop\fastmri-reproducible-benchmark-master\fastmri-reproducible-benchmark-master>python kikinet_sep_approach_af4_ori.py
2020-11-21 14:07:31.782620: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-11-21 14:07:33.440426: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library nvcuda.dll
2020-11-21 14:07:33.624847: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-21 14:07:33.635379: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: RON-MJ09HCC4
2020-11-21 14:07:33.645497: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: RON-MJ09HCC4
2020-11-21 14:07:33.651373: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2020-11-21 14:07:33.685297: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2532a87c4f0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-21 14:07:33.692565: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
Model: "model"
______________________________________________________________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
======================================================================================================================================================
kspace_input (InputLayer) [(None, 640, None, 1)] 0
______________________________________________________________________________________________________________________________________________________
lambda (Lambda) (None, 640, None, 1) 0 kspace_input[0][0]
______________________________________________________________________________________________________________________________________________________
lambda_1 (Lambda) (None, 640, None, 1) 0 kspace_input[0][0]
______________________________________________________________________________________________________________________________________________________
concatenate (Concatenate) (None, 640, None, 2) 0 lambda[0][0]
lambda_1[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d (Conv2D) (None, 640, None, 48) 912 concatenate[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_1 (Conv2D) (None, 640, None, 48) 20784 conv2d[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_2 (Conv2D) (None, 640, None, 48) 20784 conv2d_1[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_3 (Conv2D) (None, 640, None, 48) 20784 conv2d_2[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_4 (Conv2D) (None, 640, None, 48) 20784 conv2d_3[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_5 (Conv2D) (None, 640, None, 48) 20784 conv2d_4[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_6 (Conv2D) (None, 640, None, 48) 20784 conv2d_5[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_7 (Conv2D) (None, 640, None, 48) 20784 conv2d_6[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_8 (Conv2D) (None, 640, None, 48) 20784 conv2d_7[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_9 (Conv2D) (None, 640, None, 48) 20784 conv2d_8[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_10 (Conv2D) (None, 640, None, 48) 20784 conv2d_9[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_11 (Conv2D) (None, 640, None, 48) 20784 conv2d_10[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_12 (Conv2D) (None, 640, None, 48) 20784 conv2d_11[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_13 (Conv2D) (None, 640, None, 48) 20784 conv2d_12[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_14 (Conv2D) (None, 640, None, 48) 20784 conv2d_13[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_15 (Conv2D) (None, 640, None, 48) 20784 conv2d_14[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_16 (Conv2D) (None, 640, None, 48) 20784 conv2d_15[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_17 (Conv2D) (None, 640, None, 48) 20784 conv2d_16[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_18 (Conv2D) (None, 640, None, 48) 20784 conv2d_17[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_19 (Conv2D) (None, 640, None, 48) 20784 conv2d_18[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_20 (Conv2D) (None, 640, None, 48) 20784 conv2d_19[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_21 (Conv2D) (None, 640, None, 48) 20784 conv2d_20[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_22 (Conv2D) (None, 640, None, 48) 20784 conv2d_21[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_23 (Conv2D) (None, 640, None, 48) 20784 conv2d_22[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_24 (Conv2D) (None, 640, None, 48) 20784 conv2d_23[0][0]
______________________________________________________________________________________________________________________________________________________
conv2d_25 (Conv2D) (None, 640, None, 2) 98 conv2d_24[0][0]
______________________________________________________________________________________________________________________________________________________
mask_input (InputLayer) [(None, 640, None)] 0
______________________________________________________________________________________________________________________________________________________
lambda_2 (Lambda) (None, 640, None, 1) 0 conv2d_25[0][0]
======================================================================================================================================================
Total params: 499,826
Trainable params: 499,826
Non-trainable params: 0
______________________________________________________________________________________________________________________________________________________
None
kikinet_sep_K1_af4_1605985653
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
WARNING:tensorflow:From kikinet_sep_approach_af4_ori.py:94: Model.fit_generator (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.
Instructions for updating:
Please use Model.fit, which supports generators.
Training: 0%| | 0/50 [00:00<?, ?it/s]Traceback (most recent call last): | 0/973 [00:00<?, ?it/s]
File "kikinet_sep_approach_af4_ori.py", line 100, in <module>
train_model(model, space='K', n=1)
File "kikinet_sep_approach_af4_ori.py", line 94, in train_model
shuffle=True,
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\util\deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1479, in fit_generator
initial_epoch=initial_epoch)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\engine\training.py", line 855, in fit
callbacks.on_train_batch_end(step, logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\callbacks.py", line 390, in on_train_batch_end
self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\tensorflow\python\keras\callbacks.py", line 298, in _call_batch_hook
batch_hook(batch, logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\keras_tqdm\tqdm_callback.py", line 117, in on_batch_end
self.append_logs(logs)
File "C:\Users\yy263\Anaconda3\envs\fastMRI\lib\site-packages\keras_tqdm\tqdm_callback.py", line 136, in append_logs
metrics = self.params['metrics']
KeyError: 'metrics'
2020-11-21 14:07:47.026364: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Failed precondition: Python interpreter state is not initialized. The process may be terminated.
[[{{node PyFunc}}]]
Training: 0%| | 0/50 [00:12<?, ?it/s]
Epoch: 0: 0%| | 0/973 [00:12<?, ?it/s]