I am wondering whether my differing results are due to the implementation in nflows or to some architectural detail I missed. Besides helping me reproduce the results, I think it would be beneficial to extend the examples section of this repository with some benchmark computations like these. I am happy to help with this task.
In the following, I present a minimal example that reproduces my results (running it 3 times, I get final test losses of 17.943886, 18.005745, 17.997137):
import numpy as np
import torch
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader

from nflows.flows import MaskedAutoregressiveFlow

# not included here; see https://github.com/gpapamak/maf for this loading script
from datasets import hepmass
# select the GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
num_layers = 5
num_hiddenfeatures1 = 512
num_hiddenfeatures2 = 512
num_blocks1 = 1
num_blocks2 = 2
activation = F.relu
lr = 1e-4 # learning rate
lr_wd = 1e-6 # weight decay
batch_size = 100
num_epochs = 400 # set some upper limit for the number of epochs
num_patience = 30
data_train, data_val, data_test = hepmass.load_data_no_discrete_normalised_as_array(
    "data/hepmass")
D = 21  # HEPMASS has 21 features after removing the discrete-valued ones
# two MAF variants with batch norm after every layer; they differ only in the
# number of hidden layers (blocks) per MADE
flow1 = MaskedAutoregressiveFlow(features=D, hidden_features=num_hiddenfeatures1,
                                 num_layers=num_layers, num_blocks_per_layer=num_blocks1,
                                 use_residual_blocks=False, activation=activation,
                                 batch_norm_between_layers=True).to(device)
flow2 = MaskedAutoregressiveFlow(features=D, hidden_features=num_hiddenfeatures2,
                                 num_layers=num_layers, num_blocks_per_layer=num_blocks2,
                                 use_residual_blocks=False, activation=activation,
                                 batch_norm_between_layers=True).to(device)
optimizer1 = optim.Adam(flow1.parameters(), lr=lr, weight_decay=lr_wd)
optimizer2 = optim.Adam(flow2.parameters(), lr=lr, weight_decay=lr_wd)
train_dataloader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(data_test, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(data_val, batch_size=batch_size, shuffle=True)
# lists that are filled with epoch-averages:
loss_val_list1 = []
loss_trn_list1 = []
loss_val_list2 = []
loss_trn_list2 = []
# define early stopping counters
counter_es1 = 0
counter_es2 = 0
# set best validation loss to infinity before training and update it during training
best_val_loss1 = np.inf
best_val_loss2 = np.inf
# flags: set to False once the early-stopping criterion is met for a model
train_model1 = True
train_model2 = True
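# Early stopping scheme used below: after each epoch the mean validation loss
# is compared with the best value seen so far; on improvement the model is
# checkpointed and the counter resets, otherwise the counter grows until
# num_patience consecutive epochs without improvement stop that model.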
# training loop:
for e in range(num_epochs):
    loss_train1_thisepoch = []
    loss_train2_thisepoch = []
    flow1.train()
    flow2.train()
    for batch in train_dataloader:
        batch = batch.type(torch.float32).to(device)
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        if train_model1:
            loss1 = -flow1.log_prob(inputs=batch).mean()
            loss1.backward()
            optimizer1.step()
            loss_train1_thisepoch.append(loss1.cpu().detach().numpy())
        if train_model2:
            loss2 = -flow2.log_prob(inputs=batch).mean()
            loss2.backward()
            optimizer2.step()
            loss_train2_thisepoch.append(loss2.cpu().detach().numpy())
    # compute epoch averages:
    if train_model1:
        loss_train1 = np.around(np.mean(loss_train1_thisepoch), decimals=2)
        loss_trn_list1.append(loss_train1)
    if train_model2:
        loss_train2 = np.around(np.mean(loss_train2_thisepoch), decimals=2)
        loss_trn_list2.append(loss_train2)
    # save validation losses of this epoch here:
    loss_val1_thisepoch = []
    loss_val2_thisepoch = []
    flow1.eval()
    flow2.eval()
    with torch.no_grad():  # no gradients needed for validation
        for val_batch in val_dataloader:
            val_batch = val_batch.type(torch.float32).to(device)
            if train_model1:
                loss_val1_thisepoch.append(np.around(
                    torch.mean(-flow1.log_prob(val_batch)).cpu().numpy(), decimals=2))
            if train_model2:
                loss_val2_thisepoch.append(np.around(
                    torch.mean(-flow2.log_prob(val_batch)).cpu().numpy(), decimals=2))
    if train_model1:
        loss_val_list1.append(np.mean(loss_val1_thisepoch))  # epoch average
        if np.mean(loss_val1_thisepoch) > best_val_loss1:  # no improvement
            if counter_es1 == num_patience - 1:  # patience exhausted: stop training
                train_model1 = False
            print(f'Early stopping counter (Model 1): {counter_es1 + 1}/{num_patience}')
            counter_es1 += 1
        else:
            counter_es1 = 0  # reset counter
            best_val_loss1 = np.mean(loss_val1_thisepoch)
            torch.save(flow1.state_dict(), "model1")  # checkpoint the best model
    if train_model2:
        loss_val_list2.append(np.mean(loss_val2_thisepoch))  # epoch average
        if np.mean(loss_val2_thisepoch) > best_val_loss2:  # no improvement
            if counter_es2 == num_patience - 1:  # patience exhausted: stop training
                train_model2 = False
            print(f'Early stopping counter (Model 2): {counter_es2 + 1}/{num_patience}')
            counter_es2 += 1
        else:
            counter_es2 = 0  # reset counter
            best_val_loss2 = np.mean(loss_val2_thisepoch)
            torch.save(flow2.state_dict(), "model2")  # checkpoint the best model
    if not train_model1 and not train_model2:
        print(f"Training finished after {e + 1} epochs!")
        break
    if train_model1:
        print(f'Epoch {e + 1}/{num_epochs} Model 1: '
              f'train loss = {loss_train1}, validation loss = {np.mean(loss_val1_thisepoch)}')
    if train_model2:
        print(f'Epoch {e + 1}/{num_epochs} Model 2: '
              f'train loss = {loss_train2}, validation loss = {np.mean(loss_val2_thisepoch)}')
### reload the model with minimal validation loss:
if counter_es1 > 0:  # the last epoch was not the best one
    flow1 = MaskedAutoregressiveFlow(features=D, hidden_features=num_hiddenfeatures1,
                                     num_layers=num_layers, num_blocks_per_layer=num_blocks1,
                                     use_residual_blocks=False, activation=activation,
                                     batch_norm_between_layers=True).to(device)
    flow1.load_state_dict(torch.load("model1", map_location=device))
if counter_es2 > 0:  # the last epoch was not the best one
    flow2 = MaskedAutoregressiveFlow(features=D, hidden_features=num_hiddenfeatures2,
                                     num_layers=num_layers, num_blocks_per_layer=num_blocks2,
                                     use_residual_blocks=False, activation=activation,
                                     batch_norm_between_layers=True).to(device)
    flow2.load_state_dict(torch.load("model2", map_location=device))
print(f"Model 1 (num_blocks={1}, num_hidden={512}) final validation loss: {best_val_loss1}")
print(f"Model 2 (num_blocks={2}, num_hidden={512}) final validation loss: {best_val_loss2}")
# pick the model with the lower (better) validation loss
if best_val_loss2 > best_val_loss1:
    best_model = 1
else:
    best_model = 2
# print the test loss of the best model:
flow1.eval()
flow2.eval()
loss_test = []
with torch.no_grad():
    for test_batch in test_dataloader:
        test_batch = test_batch.type(torch.float32).to(device)
        if best_model == 1:
            loss = -flow1.log_prob(inputs=test_batch).mean()
        else:
            loss = -flow2.log_prob(inputs=test_batch).mean()
        loss_test.append(loss.cpu().numpy())
average_testloss = np.mean(loss_test)
print(f"vanilla: Final test loss after {e + 1} epochs: {average_testloss}")