yyzharry / imbalanced-regression Goto Github PK
View Code? Open in Web Editor NEW[ICML 2021, Long Talk] Delving into Deep Imbalanced Regression
Home Page: http://dir.csail.mit.edu
License: MIT License
[ICML 2021, Long Talk] Delving into Deep Imbalanced Regression
Home Page: http://dir.csail.mit.edu
License: MIT License
Hi !
Thank you for sharing your work on LDS and FDS. It really inspired me in my work.
I'm trying to predict price (normalized with Z-score) given features about restaurants and deliveries (time, location, distance, etc.). My data is imbalanced and I want my model to perform well whatever the price value is.
To help me understand your work, I re-implemented LDS and FDS (based on your advice). I can see that LDS improves my model's performance, but FDS does worse (see graph below).
Moreover, when I look at how the statistics are smoothed, I can see that LDS smooths better than FDS (see graph below).
Have you ever experienced cases/data where FDS performs worse than vanilla?
Thank you in advance for your help !
My code:
# imports
import time
import numpy as np
import torch
import torch.nn.functional as F
from pandas import DataFrame, Series
from scipy.ndimage import convolve1d, gaussian_filter1d
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
# classes
class FDSLayer(nn.Module):
    """Feature distribution smoothing (FDS) over equal-width label bins.

    The layer keeps, for every label bin, an exponential moving average of
    the per-dimension feature mean and variance, plus a kernel-smoothed copy
    of both. ``smooth`` re-standardises features from the raw statistics of
    their bin to the smoothed statistics of that bin.

    Args:
        input_dim: Number of feature dimensions.
        n_bins: Number of equal-width label bins.
        kernel_size: Requested width of the smoothing kernel; the kernel
            actually used has ``2 * ((kernel_size - 1) // 2) + 1`` taps.
        alpha: Momentum of the running statistics (weight of the old value).
        start_smooth: First epoch index at which ``smooth`` alters features.
    """

    def __init__(self, input_dim, n_bins: int = 100, kernel_size: int = 5, alpha: float = .9, start_smooth: int = 1):
        super(FDSLayer, self).__init__()
        self.register_buffer('mean', torch.zeros((n_bins, input_dim)))
        self.register_buffer('var', torch.ones((n_bins, input_dim)))
        self.register_buffer('smoothed_mean', torch.zeros((n_bins, input_dim)))
        self.register_buffer('smoothed_var', torch.ones((n_bins, input_dim)))
        self.input_dim = input_dim
        self.alpha = alpha
        self.n_bins = n_bins
        self.kernel_size = kernel_size
        self.half_kernel_size = (kernel_size - 1) // 2
        self.sigma = 2
        self.start_smooth = start_smooth
        # Bin edges of the label set last passed to update_running_stats.
        # Kept as a plain attribute (not a buffer) so state_dict keys are
        # unchanged; None until the first statistics update.
        self._bin_edges = None

    def smooth(self, inputs, labels, epoch):
        """Calibrate ``inputs`` with the smoothed statistics of their label bin.

        Args:
            inputs: Feature tensor with one row per sample.
            labels: Labels of those samples; flattened before binning.
            epoch: Current epoch index; before ``start_smooth`` the inputs
                are returned untouched.

        Returns:
            ``inputs`` itself when ``epoch < start_smooth``, otherwise the
            calibrated features with the same shape as ``inputs``.
        """
        if epoch < self.start_smooth:
            return inputs
        bin_indexes = self._get_bin_indexes(labels).to(inputs.device)
        factor = torch.clamp(torch.sqrt(self.smoothed_var / self.var), .1, 10)
        # 0 / 0 (a bin whose raw and smoothed variance are both zero) yields
        # NaN, which clamp does not remove; leave such dimensions unscaled.
        factor = torch.nan_to_num(factor, nan=1.0)
        return (inputs - self.mean[bin_indexes]) * factor[bin_indexes] + self.smoothed_mean[bin_indexes]

    def _get_bin_indexes(self, labels, refresh_edges: bool = False):
        """Map labels to bin indexes in ``[0, n_bins - 1]``.

        The edges stored by the last statistics update are reused so that a
        mini-batch is binned on the same grid as the running statistics.
        Edges are derived from ``labels`` themselves only when
        ``refresh_edges`` is set or no edges have been stored yet.

        Args:
            labels: Label tensor of any shape; it is flattened.
            refresh_edges: Recompute the edges from ``labels`` and store them.

        Returns:
            1-D CPU tensor of bin indexes, one per label.
        """
        flat = labels.detach().reshape(-1).float().cpu()
        edges = self._bin_edges
        if refresh_edges or edges is None:
            _, edges = torch.histogram(flat, self.n_bins)
            if refresh_edges:
                self._bin_edges = edges
        # Only the inner edges are used as boundaries, so labels outside the
        # stored range fall into the first or last bin.
        return torch.bucketize(flat, edges[1:-1])

    def _get_kernel_window(self):
        """Return the normalised Gaussian smoothing kernel as a float32 tensor."""
        base_kernel = [0.] * self.half_kernel_size + [1.] + [0.] * self.half_kernel_size
        kernel_window = gaussian_filter1d(base_kernel, sigma=self.sigma)
        kernel_window = kernel_window / kernel_window.sum()
        return torch.tensor(kernel_window, dtype=torch.float32)

    def _smooth_stats(self, stats, kernel):
        """Convolve ``stats`` (n_bins, input_dim) with ``kernel`` along the bin axis."""
        # conv1d wants (batch, channels, length): treat every feature
        # dimension as a batch item with a single channel.
        padded = F.pad(stats.t().unsqueeze(1),
                       pad=(self.half_kernel_size, self.half_kernel_size),
                       mode='reflect')
        smoothed = F.conv1d(padded, weight=kernel.to(stats).view(1, 1, -1), padding=0)
        # Drop only the channel axis so input_dim == 1 keeps its 2-D shape.
        return smoothed.squeeze(1).t().contiguous()

    def update_running_stats(self, features, labels):
        """Update the running and smoothed statistics from one pass of data.

        The bin edges are recomputed from ``labels`` and stored for later
        ``smooth`` calls, so ``labels`` should cover the same label set on
        every call for the running statistics to stay aligned.

        Args:
            features: Tensor with one feature row per sample.
            labels: Labels of those samples; flattened before binning.
        """
        features = features.detach().to(device=self.mean.device, dtype=self.mean.dtype)
        bin_indexes = self._get_bin_indexes(labels, refresh_edges=True).to(features.device)

        mean = self.mean.clone()
        var = self.var.clone()
        # Only bins that received samples are updated; the others keep their
        # previous running statistics instead of being pulled toward 0 / 1.
        for b in torch.unique(bin_indexes):
            members = features[bin_indexes == b]
            batch_mean = members.mean(dim=0)
            # The unbiased estimator is undefined for a single sample.
            batch_var = members.var(dim=0, unbiased=members.size(0) > 1)
            mean[b] = self.alpha * self.mean[b] + (1 - self.alpha) * batch_mean
            var[b] = self.alpha * self.var[b] + (1 - self.alpha) * batch_var
        self.mean = mean
        self.var = var

        fds_kernel_window = self._get_kernel_window()
        self.smoothed_mean = self._smooth_stats(self.mean, fds_kernel_window)
        self.smoothed_var = self._smooth_stats(self.var, fds_kernel_window)
class MLPNetwork(nn.Module):
    """ReLU multi-layer perceptron for scalar regression with optional LDS / FDS.

    Args:
        input_dim: Number of input features.
        hidden_units: Width of each hidden layer, in order.
        lds: Weight the training loss by the inverse smoothed label density.
        fds: Pass the last hidden features through an ``FDSLayer`` while
            training.
        n_bins: Number of label bins used by LDS and FDS.
        kernel_size: Requested smoothing kernel width for LDS and FDS.
        alpha: Momentum forwarded to the ``FDSLayer``.
        start_smooth: First epoch at which the ``FDSLayer`` alters features.
    """

    def __init__(self, input_dim: int, hidden_units: tuple = (128, 128, 128), lds: bool = False, fds: bool = False,
                 n_bins: int = 20, kernel_size: int = 5, alpha: float = .9, start_smooth: int = 1):
        super(MLPNetwork, self).__init__()
        self.lds = lds
        self.fds = fds
        self.n_bins = n_bins
        self.kernel_size = kernel_size
        self.half_kernel_size = kernel_size // 2
        self.sigma = 2
        layer_dims = (input_dim,) + tuple(hidden_units)
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_dims[:-1], layer_dims[1:])]
        )
        if self.fds:
            self.fds_layer = FDSLayer(hidden_units[-1], n_bins=n_bins, kernel_size=kernel_size, alpha=alpha,
                                      start_smooth=start_smooth)
        self.output_layer = nn.Linear(hidden_units[-1], 1)

    def forward(self, inputs, labels=None, epoch=None):
        """Run the network.

        Args:
            inputs: Batch of input rows.
            labels: Labels of the batch; only used for FDS while training.
            epoch: Current epoch index; only used for FDS while training.

        Returns:
            Tuple ``(predictions, features)`` where ``features`` are the last
            hidden activations before any FDS calibration.
        """
        x = inputs
        for layer in self.layers:
            x = torch.relu(layer(x))
        features = x.view(x.size(0), -1)
        head_inputs = features
        # FDS needs the labels, so it is applied only in training mode and
        # only when the caller actually supplied labels and an epoch.
        if self.training and self.fds and labels is not None and epoch is not None:
            head_inputs = self.fds_layer.smooth(features, labels, epoch)
        return self.output_layer(head_inputs), features

    def _lds_weights(self, labels):
        """Return one inverse-smoothed-density weight per label."""
        empirical_label_distribution, bin_edges = torch.histogram(labels, bins=self.n_bins, density=True)
        lds_kernel_window = self._get_kernel_window()
        padded = F.pad(empirical_label_distribution.view(1, 1, -1),
                       pad=(self.half_kernel_size, self.half_kernel_size),
                       mode='reflect')
        effective_label_distribution = F.conv1d(padded, weight=lds_kernel_window.view(1, 1, -1),
                                                padding=0).view(-1)
        # Every looked-up bin contains at least one label and the kernel
        # centre is positive, so the density read here is never zero.
        return 1 / effective_label_distribution[torch.bucketize(labels, bin_edges[1:-1])]

    def fit(self, inputs, labels, val_inputs, val_labels, epochs: int = 200, batch_size: int = 1024):
        """Train with Adam on the (optionally LDS-weighted) MAE loss.

        Prints the epoch header and a line with elapsed time, training loss
        and validation loss for every epoch. When FDS is enabled, the hidden
        features seen during the epoch are used to update the FDS statistics
        at the end of that epoch.

        Args:
            inputs: Training inputs (array-like or ``DataFrame``).
            labels: Training labels (array-like or ``Series``).
            val_inputs: Validation inputs (array-like or ``DataFrame``).
            val_labels: Validation labels (array-like or ``Series``).
            epochs: Number of passes over the training data.
            batch_size: Mini-batch size for both loaders.
        """
        if isinstance(inputs, DataFrame):
            inputs = inputs.values
        if isinstance(val_inputs, DataFrame):
            val_inputs = val_inputs.values
        if isinstance(labels, Series):
            labels = labels.values
        if isinstance(val_labels, Series):
            val_labels = val_labels.values

        # Create train dataloader
        inputs = torch.FloatTensor(inputs)
        labels = torch.FloatTensor(labels)
        weights = self._lds_weights(labels) if self.lds else torch.ones(labels.size())
        train_dataset = TensorDataset(inputs, labels.view(-1, 1), weights)
        train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4)

        # Create validation dataloader (order is irrelevant for evaluation)
        val_inputs = torch.FloatTensor(val_inputs)
        val_labels = torch.FloatTensor(val_labels).view(-1, 1)
        val_dataset = TensorDataset(val_inputs, val_labels)
        val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False, num_workers=4)

        loss_fn = self.weighted_mae_loss
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)

        # Train
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            t0 = time.time()

            self.train(True)
            train_loss = 0.
            n_train_batches = 0
            feature_batches, label_batches = [], []
            for n_train_batches, (batch_inputs, batch_labels, batch_weights) in enumerate(train_dataloader, 1):
                optimizer.zero_grad(set_to_none=True)
                predictions, feat = self(batch_inputs, batch_labels, epoch)
                if self.fds:
                    # Keep whole 2-D / 1-D tensors so that a final batch with
                    # a single sample is recorded as one row, not flattened.
                    feature_batches.append(feat.detach())
                    label_batches.append(batch_labels.detach().view(-1))
                loss = loss_fn(predictions, batch_labels, batch_weights)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            train_loss = train_loss / max(n_train_batches, 1)

            self.train(False)
            val_loss = 0.
            n_val_batches = 0
            with torch.no_grad():
                for n_val_batches, (batch_inputs, batch_labels) in enumerate(val_dataloader, 1):
                    predictions = self(batch_inputs, batch_labels, epoch)[0]
                    val_loss += loss_fn(predictions, batch_labels).item()
            val_loss = val_loss / max(n_val_batches, 1)

            if self.fds and feature_batches:
                self.fds_layer.update_running_stats(torch.cat(feature_batches), torch.cat(label_batches))
            print(f'time: {time.time() - t0:.1f}s - loss: {train_loss:.4f} - val_loss: {val_loss:.4f}\n')

    def _get_kernel_window(self):
        """Return the normalised Gaussian smoothing kernel as a float tensor."""
        base_kernel = [0.] * self.half_kernel_size + [1.] + [0.] * self.half_kernel_size
        kernel_window = gaussian_filter1d(base_kernel, sigma=self.sigma)
        kernel_window = kernel_window / sum(kernel_window)
        return torch.FloatTensor(kernel_window)

    def predict(self, inputs):
        """Run inference without gradient tracking.

        Args:
            inputs: Input rows (array-like or ``DataFrame``).

        Returns:
            The same ``(predictions, features)`` tuple as ``forward``.
        """
        if isinstance(inputs, DataFrame):
            inputs = inputs.values
        inputs = torch.FloatTensor(inputs)
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                return self(inputs)
        finally:
            # Leave the module in the mode the caller had it in.
            self.train(was_training)

    @staticmethod
    def weighted_mae_loss(predictions, targets, weights=None):
        """Return the mean absolute error, optionally weighted per sample.

        Args:
            predictions: Model outputs.
            targets: Ground-truth values, same shape as ``predictions``.
            weights: Optional per-sample weights; squeezed and expanded to
                the shape of the squeezed element-wise error.

        Returns:
            Scalar tensor holding the mean of the (weighted) absolute errors.
        """
        loss = torch.abs(predictions - targets).squeeze()
        if weights is not None:
            # Out-of-place multiply: do not mutate a tensor autograd tracks.
            loss = loss * weights.squeeze().expand_as(loss)
        return torch.mean(loss)
老师您好,我想将FDS、LDS应用到自己的数据上,请问FDS、LDS示例里的kernel、 ks、sigma等参数需要根据什么情况去设定呢?
Hi @YyzHarry, the reproduced benchmark and model files seem to be damaged. I used this link ( https://drive.google.com/file/d/1CPDlcRCQ1EC4E3x9w955cmILSaVOlkyz/view ), but the downloaded model cannot be opened; it reports that the data has been damaged. Could you update the model? Thank you!
I have the honor to read your paper, which is very solid. Now I am doing a task. label is a 256 * 256 picture, which needs to predict each pixel value of it (unbalanced continuous value between 0 and 1). Now I want to apply your LDS strategy, but my task is slightly different from yours. Although my data set is only 1W pieces, if each pixel is regarded as a label, the label will be very large(256 * 256 *10000). Do you think your method is still applicable?
I did an experiment where I calculated the value distribution for each label(256 * 256 images) separately and then applied the LDS strategy. It didn't work out too well.
Hello, I have some questions about the error pdf. Can I know how to get the right error pdf? Each labels have different numbers of samples, so should I apply a mean method for each label error or make them have the same amount?
Hi,
Thank you for your great work on the imbalanced regression problem. However, I notice that this work mostly discusses 1D regression problems. What if the output has more than one dimension (e.g. Batch_Size x 10)?
Any suggestion will be helpful.
Thanks.
Hi, I would like to test your solution for a dataset of imbalanced distribution that I have. Do you think that you could make it available?
Hi, I'm wondering whether I can use this method as a preprocessing step for non-DNN models, namely simple linear regression or elastic net regression? If so, how should I adopt this method?
Thank you so much!
Hi,
Could you help clarify your implementation for SMOGN and RRT in the paper?
Thanks for clarifying!
Hi! This work is really fantastic! However, I found it hard to apply LDS/FDS to classic machine learning models like random forest. For example, after getting the effective label density with LDS, how should I use this?
Hi authors,
page 6 from your paper:
Precisely, Focal-R loss based on L1 distance can be written as $\frac{1}{n}\sum_{i=1}^{n} \sigma(|\beta e_i|)^{\gamma} e_i$, where $e_i$ is the L1 error for the $i$-th sample, $\sigma(\cdot)$
imbalanced-regression/agedb-dir/loss.py
Line 24 in 055a7b3
2*torch.abs(...)-1
Why? The function in your paper has neither the `-1` nor the `2*`.
Hi @YyzHarry,
I am trying to adapt the example from here https://github.com/YyzHarry/imbalanced-regression/tree/main/agedb-dir with my custom model and data. Thus, I would like to ask you whether this would be feasible and if yes if there are any example showing explicitly how to do that.
Thanks.
Excuse me.
Is it convenient to provide the SHHS data and its preprocessing scripts?
Thank you advance!
hi,SHHS-DIR 的数据需要授权,不便上传。论文中基于该数据集的相关代码方便上传下吗?
谢谢
In loss.py, "weights" is the last parameter in function of weighted_huber_loss or weighted_focal
but in train.py,
loss = globals()[f"weighted_{args.loss}_loss"](outputs, targets, weights)
weights is passed positionally, without its keyword name
maybe it should be
loss = globals()[f"weighted_{args.loss}_loss"](outputs, targets, weights = weights)
or it would make mistakes when using huber + LDS or focal + LDS
Hi Team,
I liked the ideas in your paper, but from reading the paper and provided code it sounds like the provided FDS and LDS code can be applied to any dataset/model? Is it really true?
imbalanced-regression/agedb-dir/fds.py
Line 120 in 055a7b3
imbalanced-regression/agedb-dir/utils.py
Line 102 in 055a7b3
Note:
I like the ideas in the paper, but due to lack of documentation/explanation I am right now spending a lot of time on generalizing the code and trying to figure out why you made some of the operations(eg. clippings)
Hi, thanks for the awesome job! I have a question about how FDS behaves during test time. It seems like that feature smoothing is disabled during the test since FDS module is only called during training for agedb-dir as follows:
if self.training and self.fds:
if epoch >= self.start_smooth:
encoding_s = self.FDS.smooth(encoding_s, targets, epoch)
Could you explain why you don't do feature smooth during testing?
Thanks a lot for your contribution, your works are really awesome.
I am very interested in your work. However, during the code reading, I did not find the SHHS-DIR dataset.
Could you publish the SHHS-DIR dataset or its sampling method, thank you!
In the paper, $\Sigma_b$ is defined as the covariance in the target bin. But in the code ("utils.py", "fds.py"), the quantity used is the variance, not the covariance.
def calibrate_mean_var(matrix, m1, v1, m2, v2, clip_min=0.1, clip_max=10):
    """Shift and rescale ``matrix`` from statistics (m1, v1) to (m2, v2).

    Each column is mapped to ``(x - m1) * sqrt(clamp(v2 / v1)) + m2``.
    Columns whose source variance ``v1`` is exactly zero are passed through
    unchanged. The input tensor is never modified in place.

    Args:
        matrix: Tensor whose last dimension lines up with the statistics.
        m1: Source mean per column.
        v1: Source variance per column.
        m2: Target mean per column.
        v2: Target variance per column.
        clip_min: Lower bound applied to the variance ratio ``v2 / v1``.
        clip_max: Upper bound applied to the variance ratio ``v2 / v1``.

    Returns:
        ``matrix`` itself when the summed source variance is below 1e-10,
        otherwise a new tensor holding the calibrated values.
    """
    if torch.sum(v1) < 1e-10:
        return matrix

    valid = v1 != 0.
    # Substitute 1 for zero variances so the division is defined; those
    # columns are discarded by the final torch.where anyway.
    safe_v1 = torch.where(valid, v1, torch.ones_like(v1))
    factor = torch.clamp(v2 / safe_v1, clip_min, clip_max)
    calibrated = (matrix - m1) * torch.sqrt(factor) + m2
    # Selecting out of place leaves the caller's tensor (and any autograd
    # graph it belongs to) untouched.
    return torch.where(valid, calibrated, matrix)
def update_running_stats(self, features, labels, epoch):
    """Fold the per-label feature statistics of one pass into the running ones.

    Nothing happens while ``epoch`` is below ``self.epoch``. For every
    distinct label inside ``[bucket_start, bucket_num - 1]`` the mean and
    variance of the matching feature rows are blended into
    ``running_mean`` / ``running_var`` at index ``label - bucket_start``.
    The two boundary labels also absorb the rows lying beyond them.

    Args:
        features: 2-D tensor, one row per sample, ``feature_dim`` columns.
        labels: Label of every row of ``features``.
        epoch: Current epoch index.
    """
    if epoch < self.epoch:
        return

    assert self.feature_dim == features.size(1), "Input feature dimension is not aligned!"
    assert features.size(0) == labels.size(0), "Dimensions of features and labels are not aligned!"

    first_label = self.bucket_start
    last_label = self.bucket_num - 1
    for label in torch.unique(labels):
        # Labels outside the tracked range have no bucket of their own.
        if label > last_label or label < first_label:
            continue

        if label == first_label:
            selected = labels <= label
        elif label == last_label:
            selected = labels >= label
        else:
            selected = labels == label
        curr_feats = features[selected]

        curr_num_sample = curr_feats.size(0)
        curr_mean = torch.mean(curr_feats, 0)
        # The unbiased estimator is not used for a single row.
        curr_var = torch.var(curr_feats, 0, unbiased=curr_num_sample != 1)

        bucket = int(label - self.bucket_start)
        self.num_samples_tracked[bucket] += curr_num_sample

        if self.momentum is not None:
            factor = self.momentum
        else:
            # Without a fixed momentum, weight by the share of samples seen so far.
            factor = 1 - curr_num_sample / float(self.num_samples_tracked[bucket])
        if epoch == self.start_update:
            # On the first update epoch the fresh statistics replace the old ones.
            factor = 0

        self.running_mean[bucket] = (1 - factor) * curr_mean + factor * self.running_mean[bucket]
        self.running_var[bucket] = (1 - factor) * curr_var + factor * self.running_var[bucket]
And in the paper "Return of frustratingly easy domain adaptation" ,in the whitening and recoloring procedure, the covariance is used. Which one should i use in feature statistics calibration?
Or , maybe i got it wrong. The variable v1, v2 in "calibrate_mean_var" function is the covariance of target bin?
Hi, I have a question for the pseudo code in Supplementary A section.
For the Algorithm 1, the LDS is only used to compute the weights for loss inverse re-weighting scheme. Why not use the smoothed labels to train the models if LDS captures the real imbalance that affects regression problems?
In addition, could you provide code for computing the effective label density distribution?
Thank you.
Hi @YyzHarry,
I hope this message finds you well. I am reaching out to you regarding your paper, specifically about the discussion on interpolation and extrapolation of sample labels. While reviewing the code, I couldn't understand whether this functionality is implemented as a separate module or if the same LDS and FDS symmetric kernel is utilized for this purpose as well.
Could you kindly provide clarification or guidance on this matter? Many thanks.
老师您好,我想将FDS、LDS应用到自己的数据上,请问FDS、LDS示例里的kernel、 ks、sigma等参数需要根据什么情况去设定呢?
According to the paper, there are a lot of details in the appendix. But I can't find it on Google. Can you do me a favor? tks~
hello, I read your paper interestingly and want to ask you a question about the prediction result processing. I would like to ask how to limit the last prediction y^ to be between 0 and 99, or to get it directly from the regression function without any processing?
Hi, I run inference with your checkpoint but all my outputs are negative ?
Y said that You hadn't used any preprocess with labels right ?
Hi, congratulations for your ICML paper, it sounds very useful and I loved the insight of Figure 2. I am trying to implement the paper right now in one of my projects. I have a couple questions regarding LDS if you don't mind me asking here.
First, I am a bit puzzled at this line in your code:
imbalanced-regression/agedb-dir/utils.py
Line 115 in b7fa502
If I understand correctly, you are using gaussian_filter1d
to create a gaussian kernel of a small size (e.g. 5 in the paper) and then you convolve this with the label distribution using convolve1d
. But isn't gaussian_filter1d
supposed to do this (with the full window as the kernel) in the first place? Looking on the Internet I find that the reason why people use small gaussian kernels in e.g. image processing is usually computational : after a width of about 3 standard deviation, a larger kernel would be useless. However, in the paper, it appears that you actually get better results with small kernels? Could you elaborate on this a little bit please?
My second question is about this line:
In the paper, I found a place where you talk about reweighting the loss by something proportional to the inverse of the smoothed label distribution (Algorithm 1 in the appendix), but nothing about this reweighting by the inverse sqrt as you seem to be doing here by default. Could you also elaborate a bit on this, please?
Thank you for your time!
Would you please tell me how to choose the best bucket_num and bucket_start parameters?The range of labels in my dataset is (-2, 16)
In equation 6 of the paper (calculating the smoothed z), what is the subscript b referring to? From my understanding, it is the target bucket for the specific example from which z was calculated. If so, how is this implemented at test time? Wouldn't we require knowledge of the bin to which the example belongs?
It appears that the FDS layer accesses the label too:
imbalanced-regression/agedb-dir/fds.py
Line 119 in dc7ae76
Feels like I'm misunderstanding something
@YyzHarry Hello, I would like to ask, how did you implement the SMOTER and SMOGN algorithms in the paper? Are there parameters in the code to call them?
Hi,
Thank you for your great work.
A little suggestion, probably you can move the data transforms to the init of the dataset class instead of invoking the get_transform each time get data.
I think this would be better to improve the speed of preparing the data and hope this is helpful for you.
Thank you.
Ningtao
School of AI, Xidian University, China
Robarts Research Institute, Western University, Canada
when update statistics in FDS, do we need to calculate the mean and variance of all sample in each bin after every epoch finishes? do we need to update the sum of z and z^2 after each mini-batch?
I was wondering if you have suggestion on how to do something like stratified K-fold cross-validation in sklearn for classification task for categorical data but here for regression task and for continuous target variable?
Unfortunately, sklearn doesn't have such an option.
Hello, in your paper on the problem of deep imbalance regression, I have the privilege of learning about the smoothing methods of LDS and FDS. In one of my machine learning projects predicting convective cloud precipitation, I wanted to use FDS to play a role in it because of the imbalance between non-precipitation samples and precipitation samples. I wonder how to smooth the feature statistic without knowing its label in the test set, in my data, my data is very unbalanced (70% of the data without precipitation), which results in the characteristic statistics of each label interval are particularly similar (about 98%), so that the smoothing effect is still not significant, and if there are some important points to pay attention to if using FDS in machine learning?
Hi, thanks for sharing this wonderful project! There's a small question I wanna ask for. I got the following error when applying the FDS module:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64]], which is output 0 of SelectBackward, is at version 7; expected version 3 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
When I add feature.detach(), the error goes away, but is it correct to do so? From my understanding, this module updates the features with the previous means and variances; does that affect the backpropagation part? Thanks in advance for any help!
Hi @YyzHarry,
I want to use your code of LDS to solve my problem,now I have a question to ask you:whether the input data format of LDS must be csv?Is npz format data OK?can it be applied to high-dimensional data? I would appreciate it if you could give me some guidance.
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.