I have a PyTorch “Embedding Hurdle” model that handles a large number of categorical variables and predicts a continuous variable that is exactly zero in many instances (50% of the training data).
In statistics, this is usually handled with two models: a classifier that predicts whether the target is different from zero, and a regressor that predicts the value of the continuous variable. I am using fastai's TabularPandas to handle the tabular preprocessing and data loading for PyTorch.
I would like to ask whether anyone has experience with this approach and can share any tips. I also noticed that the model is quite slow to train, even though the dataset is small. The code for the model is available on StackOverflow.
Example code and training loop:
# +
import pandas as pd
from fastai.tabular.all import *
def EndSplitter(valid_pct=0.2, valid_last=True):
    """Create a splitter that holds out one contiguous end of the items.

    Useful for ordered data, where the validation set should be the most
    recent (or the oldest) slice rather than a random sample.

    Args:
        valid_pct: Fraction of the items assigned to validation; must lie
            strictly between 0 and 1.
        valid_last: If true the validation items are taken from the end of
            the collection, otherwise from the start.

    Returns:
        A function taking a sized collection `o` and returning
        `(train_idxs, valid_idxs)` as two lists of integer positions.
    """
    assert 0 < valid_pct < 1, "valid_pct must be in (0,1)"

    def _inner(o):
        n = len(o)
        idxs = list(range(n))
        cut = int(valid_pct * n)
        if valid_last:
            # Slice on an explicit boundary: with cut == 0 the negative-index
            # form `idxs[:-cut]` / `idxs[-cut:]` would yield an empty training
            # set and put every item in validation.
            return idxs[:n - cut], idxs[n - cut:]
        return idxs[cut:], idxs[:cut]

    return _inner
# Load the raw table and declare which columns play which role.
df = pd.read_csv("data_files.csv")
y_name = 'y'

# Every input feature is categorical; there are no continuous inputs.
cat_vars = [
    "day_of_week",
    "is_holiday",
    "is_day_before_holiday",
    "year",
    "month",
    "week_of_year",
    "cat1",
    "cat2",
    "cat3",
    "cat4",
    "cat5",
]
cont_vars = []
procs = [Normalize, Categorify, FillMissing]

# Hold out the last 15% of the rows (by position) as the validation set.
split_fn = EndSplitter(valid_pct=0.15)
splits = split_fn(range_of(df))
# +
# Wrap the frame for fastai: preprocessing steps, column roles, the
# train/valid split and a single-output regression target are declared here.
to = TabularPandas(
    df,
    cat_names=cat_vars,
    cont_names=cont_vars,
    y_names=[y_name],
    y_block=RegressionBlock(n_out=1),
    procs=procs,
    splits=splits,
    device='cuda',
)

# 512 * 3 = 1536 rows per batch; the training batches are shuffled.
dls = to.dataloaders(shuffle_train=True, bs=512 * 3)
# -
class EmbeddingTabularModel(nn.Module):
    """Two-headed ("hurdle") tabular network over embedded categorical features.

    Each categorical column is looked up in its own embedding table. The
    embeddings (plus the numerical features, when there are any) are
    concatenated and passed through a shared trunk. Two heads of identical
    shape sit on top of the trunk: a regression head with a linear output and
    a classification head whose output goes through a sigmoid.

    Args:
        num_numerical_features: Number of numerical input columns. When 0,
            `x_numerical` is ignored by the forward methods.
        emb_szs: Iterable of `(num_embeddings, embedding_dim)` pairs, one per
            categorical column, in column order.
        hidden_dims: Widths of the shared trunk; needs at least one entry.
        dropout_prob: Dropout probability used in every hidden block.
        output_hidden_dims: Widths of the hidden blocks inside each head. Any
            length is accepted; an empty sequence connects the trunk directly
            to the output layers.
    """

    def __init__(self, num_numerical_features, emb_szs, hidden_dims, dropout_prob=0.1, output_hidden_dims=(16, 8)):
        super().__init__()
        self.n_cont = num_numerical_features
        self.embedding_layers = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.n_emb = sum(e.embedding_dim for e in self.embedding_layers)

        # Input projection. The forward pass concatenates the numerical
        # features onto the embeddings, so both widths must be counted here.
        self.linear_layers = nn.Linear(self.n_emb + self.n_cont, hidden_dims[0])

        # Shared trunk.
        self.hidden_layers = nn.ModuleList([
            self._dense_block(hidden_dims[i], hidden_dims[i + 1], dropout_prob)
            for i in range(len(hidden_dims) - 1)
        ])

        # Two structurally identical heads on top of the trunk.
        head_dims = list(output_hidden_dims)
        head_out_width = head_dims[-1] if head_dims else hidden_dims[-1]

        self.regression_hidden_layers = self._head_blocks(hidden_dims[-1], head_dims, dropout_prob)
        self.regression_output = nn.Linear(head_out_width, 1)

        self.classification_hidden_layers = self._head_blocks(hidden_dims[-1], head_dims, dropout_prob)
        self.classification_output = nn.Linear(head_out_width, 1)

    @staticmethod
    def _dense_block(n_in, n_out, dropout_prob):
        """Return one Linear -> BatchNorm1d -> Dropout -> ReLU block."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
        )

    @classmethod
    def _head_blocks(cls, n_in, widths, dropout_prob):
        """Return the hidden blocks of one head, chaining `n_in` through `widths`."""
        dims = [n_in, *widths]
        return nn.ModuleList([
            cls._dense_block(a, b, dropout_prob) for a, b in zip(dims, dims[1:])
        ])

    def _shared_features(self, x_numerical, x_categorical):
        """Embed the categorical columns and run the shared trunk once."""
        embedded = [emb(x_categorical[:, i]) for i, emb in enumerate(self.embedding_layers)]
        x = torch.cat(embedded, 1)
        if self.n_cont != 0:
            x = torch.cat([x, x_numerical], dim=1)
        x = self.linear_layers(x)
        for block in self.hidden_layers:
            x = block(x)
        return x

    def _regression_head(self, x):
        """Map trunk features to the unbounded regression output."""
        for block in self.regression_hidden_layers:
            x = block(x)
        return self.regression_output(x)

    def _classification_head(self, x):
        """Map trunk features to a probability via a sigmoid."""
        for block in self.classification_hidden_layers:
            x = block(x)
        return torch.sigmoid(self.classification_output(x))

    def forward_regression(self, x_numerical, x_categorical):
        """Return only the regression output, one column per row."""
        return self._regression_head(self._shared_features(x_numerical, x_categorical))

    def forward_classification(self, x_numerical, x_categorical):
        """Return only the classification probability, one column per row."""
        return self._classification_head(self._shared_features(x_numerical, x_categorical))

    def forward(self, x_numerical, x_categorical):
        """Return `(regression_output, classification_output)`.

        The embeddings and trunk are evaluated once and shared by both heads,
        so in training mode both heads see the same dropout mask and the
        BatchNorm running statistics are updated once per call.
        """
        features = self._shared_features(x_numerical, x_categorical)
        return self._regression_head(features), self._classification_head(features)
# +
# Build the network with no numerical inputs and embedding sizes derived
# from the data loaders, then place it on the GPU.
m = EmbeddingTabularModel(
    num_numerical_features=0,
    emb_szs=get_emb_sz(dls),
    hidden_dims=[250, 100],
).to('cuda')
m
dls.to('cuda')
# +
import torch.optim as optim
from tqdm import tqdm
# One criterion per training term: squared error for the regression head,
# binary cross-entropy for the classification head, and squared error for
# the product of the two heads. Adam updates every model parameter.
bce_loss_fn = nn.BCELoss()
mse_loss_fn = nn.MSELoss()
merged_loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=m.parameters(), lr=1e-3)
# +
# Iterate over dataloader
# Training loop: `max_iters` full passes over the training loader, with one
# validation measurement after each pass.
max_iters = 1_000
total_rmse = 0.0  # running sum of the per-pass validation RMSE

# Keep every batch on the same device as the model. `Tensor.to` returns a new
# tensor, so the result has to be assigned for the move to take effect.
device = next(m.parameters()).device

# The validation batch and its transformed target are the same on every pass,
# so fetch and prepare them once.
# NOTE(review): assumes the validation loader is not shuffled (the fastai
# default), i.e. `one_batch()` would return the same rows each time - confirm.
with torch.no_grad():
    val_cat, val_cont, val_y = dls.valid.one_batch()
    val_cat, val_cont, val_y = val_cat.to(device), val_cont.to(device), val_y.to(device)
    mask = (val_y != 0)
    val_y = torch.where(mask, torch.log(val_y), torch.zeros_like(val_y))

for step in tqdm(range(max_iters)):
    m.train()  # dropout active, BatchNorm uses batch statistics
    for cat, cont, y in dls.train:
        cat, cont, y = cat.to(device), cont.to(device), y.to(device)

        # Classification target: 1 where the raw target is non-zero, else 0.
        y_class = (y != 0).type(torch.float32)
        mask = (y != 0)
        # Regression target: log of the raw target, with zeros kept at 0.
        # NOTE(review): a zero target therefore looks like log(1) to the
        # regression loss; consider restricting that loss to rows where
        # `mask` is true if the head should model only non-zero values.
        y = torch.where(mask, torch.log(y), torch.zeros_like(y))

        reg_y_pred, clas_y_pred = m(cont, cat)

        # Individual head losses plus a loss on the product of the heads.
        regression_loss = mse_loss_fn(reg_y_pred, y)
        classification_loss = bce_loss_fn(clas_y_pred, y_class)
        merged = reg_y_pred * clas_y_pred
        merged_loss = merged_loss_fn(merged, y)
        total_loss = regression_loss + classification_loss + 3*merged_loss

        # Backward pass and weight update.
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    # Validation: switch to inference behaviour so dropout is disabled and
    # BatchNorm uses its running statistics.
    m.eval()
    with torch.no_grad():
        pred_test_reg, pred_class_reg = m(val_cont, val_cat)
        pred_test = pred_test_reg * pred_class_reg
        # Both sides are exponentiated back from log space; rows whose raw
        # target was zero therefore contribute exp(0) = 1 as the reference.
        val_rmse = torch.sqrt(mse_loss_fn(pred_test.exp(), val_y.exp()))
    total_rmse += val_rmse.item()

    if step % 100 == 0 or step == max_iters - 1:
        print("STEP:", step)
        # Losses shown are those of the last training batch of this pass.
        print("Regression loss:", regression_loss.item())
        print("Total loss:", total_loss.item())
        # Running mean of the validation RMSE over all passes so far.
        print("Validation RMSE:", total_rmse/(step+1))