PyTorch Embedding Hurdle model for regression with a lot of zeros

I have a PyTorch “Embedding Hurdle” model that handles many categorical variables and predicts a continuous target that is zero in a large share of cases (50% of the training data).

In statistics, a hurdle model consists of two parts: a classifier that predicts whether the outcome is nonzero, and a regressor that predicts the continuous value itself. I am using fastai's TabularPandas to handle the tabular PyTorch plumbing.
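The two parts are then recombined by multiplying them, roughly E[y] = P(y != 0) * E[y | y != 0]. A minimal sketch with made-up tensors (the names here are hypothetical, not from the real model below):

import torch

classifier_logits = torch.randn(4, 1)          # "did the hurdle get cleared?" head
regressor_output = torch.randn(4, 1)           # conditional-mean head
p_nonzero = torch.sigmoid(classifier_logits)   # P(y != 0)
expected_y = p_nonzero * regressor_output      # E[y] = P(y != 0) * E[y | y != 0]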

I want to ask if anyone has experience with this approach and has any tips. I also noticed that the model trains quite slowly, despite the small dataset. The full model and training loop are below.

Example code and training loop:


import pandas as pd
from fastai.tabular.all import *

def EndSplitter(valid_pct=0.2, valid_last=True):
    "Create function that splits items between train/val with valid_pct at the end if valid_last else at the start. Useful for ordered data."
    assert 0<valid_pct<1, "valid_pct must be in (0,1)"
    def _inner(o):
        idxs = range_of(o)
        cut = int(valid_pct * len(o))
        return (idxs[:-cut], idxs[-cut:]) if valid_last else (idxs[cut:],idxs[:cut])
    return _inner
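# e.g. EndSplitter(valid_pct=0.2)(range(10)) -> ([0, 1, ..., 7], [8, 9]),
# i.e. the last 20% of the (ordered) rows become the validation set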

df = pd.read_csv("data_files.csv")

y_name = 'y'
cat_vars = ["day_of_week", "is_holiday", "is_day_before_holiday", "year", "month", "week_of_year", "cat1", "cat2", "cat3", "cat4", "cat5"]
cont_vars = []

procs = [Categorify, FillMissing, Normalize]

splits = EndSplitter(valid_pct=0.15)(range_of(df))

to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_vars,
    cont_names=cont_vars,
    y_names=[y_name],
    splits=splits,
    device='cuda',
    y_block=RegressionBlock(n_out=1)
)

dls = to.dataloaders(
    bs=int(512 * 3), shuffle_train=True
)  # standard PyTorch-style DataLoaders; each batch holds 1536 rows



class EmbeddingTabularModel(nn.Module):
    def __init__(self, num_numerical_features, emb_szs, hidden_dims, dropout_prob=0.1, output_hidden_dims=[16, 8]):
        super().__init__()
        
        self.n_cont = num_numerical_features
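        # emb_szs holds one (vocab size, embedding width) pair per categorical
        # column, e.g. as produced by fastai's get_emb_sz(dls) further down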
        self.embedding_layers = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        n_emb = sum(e.embedding_dim for e in self.embedding_layers)
        self.n_emb= n_emb
        
        # Input projection for the concatenated embeddings (plus numerical features, if any)
        self.linear_layers = nn.Linear(n_emb + self.n_cont, hidden_dims[0])
        
        # Hidden layers
        self.hidden_layers = nn.ModuleList([nn.Sequential(
                                                nn.Linear(hidden_dims[i], hidden_dims[i+1]),
                                                nn.BatchNorm1d(hidden_dims[i+1]),
                                                nn.Dropout(p=dropout_prob),
                                                nn.ReLU()
                                            )
                                            for i in range(len(hidden_dims)-1)])
        
        # Output heads: two identical small MLPs, one per hurdle part
        def make_head():
            return nn.ModuleList([
                nn.Sequential(
                    nn.Linear(hidden_dims[-1], output_hidden_dims[0]),
                    nn.BatchNorm1d(output_hidden_dims[0]),
                    nn.Dropout(p=dropout_prob),
                    nn.ReLU(),
                ),
                nn.Sequential(
                    nn.Linear(output_hidden_dims[0], output_hidden_dims[1]),
                    nn.BatchNorm1d(output_hidden_dims[1]),
                    nn.Dropout(p=dropout_prob),
                    nn.ReLU(),
                ),
            ])

        self.regression_hidden_layers = make_head()
        self.regression_output = nn.Linear(output_hidden_dims[-1], 1)

        self.classification_hidden_layers = make_head()
        self.classification_output = nn.Linear(output_hidden_dims[-1], 1)
    
    def _encode(self, x_numerical, x_categorical):
        # Embed each categorical column and concatenate the results
        x_categorical = [e(x_categorical[:, i]) for i, e in enumerate(self.embedding_layers)]
        x = torch.cat(x_categorical, 1)
        if self.n_cont != 0:
            x = torch.cat([x, x_numerical], dim=1)

        # Shared trunk: input projection followed by the hidden layers
        x = self.linear_layers(x)
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
        return x

    def forward(self, x_numerical, x_categorical):
        # Compute the shared trunk once, then branch into the two heads
        x = self._encode(x_numerical, x_categorical)

        reg = x
        for hidden_layer in self.regression_hidden_layers:
            reg = hidden_layer(reg)
        regression_output = self.regression_output(reg)

        clas = x
        for hidden_layer in self.classification_hidden_layers:
            clas = hidden_layer(clas)
        classification_output = torch.sigmoid(self.classification_output(clas))

        return regression_output, classification_output

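# Quick shape sanity check with dummy inputs (sizes here are illustrative:
# 11 categorical columns, vocab size 10, no continuous features)
_m = EmbeddingTabularModel(0, emb_szs=[(10, 5)] * 11, hidden_dims=[250, 100])
_reg, _clas = _m(torch.empty(4, 0), torch.randint(0, 10, (4, 11)))
assert _reg.shape == (4, 1) and _clas.shape == (4, 1)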

m = EmbeddingTabularModel(0, emb_szs=get_emb_sz(dls), hidden_dims=[250, 100])
m = m.to('cuda')

dls.to('cuda')

import torch.optim as optim
from tqdm import tqdm

mse_loss_fn = nn.MSELoss()
bce_loss_fn = nn.BCELoss()  # expects probabilities; the model applies sigmoid itself
merged_loss_fn = nn.MSELoss()

optimizer = optim.Adam(m.parameters(), lr=1e-3)

# Training loop: one "step" below is a full pass over the training dataloader
max_iters = 1_000
total_rmse = 0
n_val_batches = 0
for step in tqdm(range(max_iters)):
    for cat, cont, y in dls.train:
        # .to() is not in-place for tensors, so the moved tensors must be reassigned
        cat, cont, y = cat.to('cuda'), cont.to('cuda'), y.to('cuda')

        # Classification target: whether the turnover is nonzero
        y_class = (y != 0).type(torch.float32)
        mask = (y != 0)
        # Log-transform the nonzero targets; zeros stay zero
        y = torch.where(mask, torch.log(y), torch.zeros_like(y))

        reg_y_pred, clas_y_pred = m(cont, cat)

        # Per-part losses: regression on the log target, classification on the
        # zero/nonzero indicator, plus a merged loss on the product of the heads
        regression_loss = mse_loss_fn(reg_y_pred, y)
        classification_loss = bce_loss_fn(clas_y_pred, y_class)
        merged = reg_y_pred * clas_y_pred
        merged_loss = merged_loss_fn(merged, y)

        # Total loss: weighted sum of the three terms
        total_loss = regression_loss + classification_loss + 3 * merged_loss

        # Backward pass and weight update
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        
        # Track RMSE on one validation batch; switch to eval mode so that
        # dropout is disabled and batchnorm uses its running statistics
        with torch.no_grad():
            m.eval()
            val_cat, val_cont, val_y = dls.valid.one_batch()

            mask = (val_y != 0)
            val_y = torch.where(mask, torch.log(val_y), torch.zeros_like(val_y))

            pred_test_reg, pred_class_reg = m(val_cont, val_cat)
            pred_test = pred_test_reg * pred_class_reg
            # Undo the log transform before computing RMSE on the original scale
            val_rmse = torch.sqrt(mse_loss_fn(pred_test.exp(), val_y.exp()))
            total_rmse += val_rmse
            n_val_batches += 1
            m.train()

    if step % 100 == 0 or step == max_iters - 1:
        print("STEP:", step)
        print("Regression loss:", regression_loss.item())
        print("Classification loss:", classification_loss.item())
        print("Total loss:", total_loss.item())
        print("Validation RMSE:", total_rmse.item() / n_val_batches)