Autoencoder for dimensionality reduction of large sparse non image data

Hi all,
I have a large matrix with 200k data points. Each data point has 20M sparse features. This is not an image data, it is more like TF_IDF data. I want to reduce the dimension from 20M to 1000. The PCA is very slow. I want to use autoencoders or t-SNE. The size of data that I have is in order of 100GB.

My question:
Is PyTorch can handle this task and are there any implementation for autoencoders that I can use for reducing the dimension of the data?

Hi Ehsan,

I’ve been working on the Satander Competition on Kaggle, which I think might have a similar data structure(~4700 features of sparse data). I built an auto encoder in pytorch to reduce the dimensionality.

Below I posted the key components but I also have a jupyter notebook I could share with you if you’re interested. I also scaled the input data between 0 and 1 so the decoder has a sigmoid on the output. You may want to change that for your dataset.

Here was my dataloader:

class AETrainingData(Dataset):
    '''
        Format the training dataset to be input into the auto encoder.
        Takes in dataframe and converts it to a PyTorch Tensor
    '''
    
    def __init__(self, x_train):
        self.x = x_train
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        '''
            Returns a example from the data set as a pytorch tensor.
        '''
        # Get example/target pair at idx as numpy arrays
        x, y = self.x.iloc[idx].values, self.x.iloc[idx].values

        # Convert to torch tensor
        x = torch.from_numpy(x).type(torch.FloatTensor)
        y = torch.from_numpy(y).type(torch.FloatTensor)
        
        # Return pair        
        return {'input': x, 'target': y}

Encoder:

class Encoder(nn.Module):
    def __init__(self, input_shape, drop_prob=0):
        super(Encoder, self).__init__()
        self.drop_prob = drop_prob
        
        self.e1 = nn.Linear(input_shape, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        
        self.e2 = nn.Linear(2048, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        
        self.e3 = nn.Linear(1024, 512)
        self.bn3 = nn.BatchNorm1d(512)
        
        self.e4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(256)
        
        self.e5 = nn.Linear(256, 50)
        
    def forward(self, input):
        
        block1 = F.dropout(self.bn1(F.elu(self.e1(input))), p=self.drop_prob)
        block2 = F.dropout(self.bn2(F.elu(self.e2(block1))), p=self.drop_prob)
        block3 = F.dropout(self.bn3(F.elu(self.e3(block2))), p=self.drop_prob)
        block4 = F.dropout(self.bn4(F.elu(self.e4(block3))), p=self.drop_prob)
        encoded_representation = F.tanh(self.e5(block4))
        return encoded_representation

Decoder:

class Decoder(nn.Module):
    def __init__(self, output_shape, drop_prob=0):
        super(Decoder, self).__init__()
        self.drop_prob = drop_prob
        
        self.d = nn.Linear(50, 256)
        self.bn = nn.BatchNorm1d(256)
        
        self.d1 = nn.Linear(256, 512)
        self.bn1 = nn.BatchNorm1d(512)
        
        self.d2 = nn.Linear(512, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        
        self.d3 = nn.Linear(1024, 2048)
        self.bn3 = nn.BatchNorm1d(2048)
        
        self.d4 = nn.Linear(2048, output_shape)
        
    
    def forward(self, input):
        block = F.dropout(self.bn(F.elu(self.d(input))), p=self.drop_prob)
        block1 = F.dropout(self.bn1(F.elu(self.d1(block))), p=self.drop_prob)
        block2 = F.dropout(self.bn2(F.elu(self.d2(block1))), p=self.drop_prob)
        block3 = F.dropout(self.bn3(F.elu(self.d3(block2))), p=self.drop_prob)
        reconstruction = F.sigmoid(self.d4(block3))
        return reconstruction

training function:

def train_ae(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion):
    
    # clear the gradients in the optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # Forward pass through 
    
    encoded_representation = encoder(input_tensor)
    reconstruction = decoder(encoded_representation)
    
    # Compute the loss
    loss = criterion(reconstruction, target_tensor)
    
    # Compute the gradients
    loss.backward()
    
    # Step the optimizers to update the model weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    # Return the loss value to track training progress
    return loss.item()

Training loop:

def trainIters(encoder, decoder, dataloader, epochs, print_every_n_batches=100, learning_rate=0.001):
    
    # keep track of losses
    plot_losses = []

    # Initialize Encoder Optimizer
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=1e-8)
    
    # Initialize Decoder Optimizer
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=1e-8)

    # Specify loss function
    criterion = nn.MSELoss(reduce=True)
    
    # Cycle through epochs
    for epoch in range(epochs):

        print(f'Epoch {epoch + 1}/{epochs}')
        # Cycle through batches
        for i, batch in enumerate(dataloader):
            
            input_tensor = batch['input'].to(device)
            target_tensor = batch['target'].to(device)
            

            loss = train_ae(input_tensor, target_tensor, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion)
            

            if i % print_every_n_batches == 0 and i != 0:
                print(loss)
                plot_losses.append(loss)
    return plot_losses
3 Likes

Hi Macbrennan,

Thanks a lot. It would be great if you can share your jupyter notebook with me.

Cheers,
Ehsan

Hi macbrennan,

can you share your jupyter notebook with me too? Thanks!