Using batches in CUDA for Autoencoder

Considering this model for an autoencoder, I am unable to fit it on an NVIDIA GTX 1070 GPU. How can I use batches to make it fit within my GPU's memory?

class Autoencoder(nn.Module):
    """Fully-connected autoencoder: num_cols -> 100-dim bottleneck -> num_cols.

    The encoder halves are fc1..fc5 (down to 100 units) and the decoder
    halves are fc6..fc10 (back up to the input width). ``forward`` returns
    every intermediate activation; the reconstruction is the LAST element
    of the returned tuple (``model(x)[-1]``) and the bottleneck code is the
    fifth (``model(x)[4]``).
    """

    def __init__(self, num_cols=None):
        """Build the layer stack.

        Args:
            num_cols: input/output feature width. Defaults to the
                module-level ``NUM_COLS`` constant for backward
                compatibility with existing callers.
        """
        super(Autoencoder, self).__init__()
        if num_cols is None:
            num_cols = NUM_COLS
        # Encoder: num_cols -> 25000 -> 15000 -> 2000 -> 500 -> 100
        self.fc1 = nn.Linear(num_cols, 25000)
        self.fc2 = nn.Linear(25000, 15000)
        self.fc3 = nn.Linear(15000, 2000)
        self.fc4 = nn.Linear(2000, 500)
        self.fc5 = nn.Linear(500, 100)
        # Decoder: 100 -> 500 -> 2000 -> 15000 -> 25000 -> num_cols
        self.fc6 = nn.Linear(100, 500)
        self.fc7 = nn.Linear(500, 2000)
        self.fc8 = nn.Linear(2000, 15000)
        self.fc9 = nn.Linear(15000, 25000)
        self.relu = nn.ReLU()
        self.fc10 = nn.Linear(25000, num_cols)
        # NOTE(review): softmax is never used in forward(); it is kept so the
        # attribute remains available to any external code (it holds no
        # parameters, so the state_dict is unaffected). The explicit dim=1
        # silences the implicit-dim deprecation warning.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run encode + decode; return all ten intermediate activations."""
        x1 = self.relu(self.fc1(x))
        x2 = self.relu(self.fc2(x1))
        x3 = self.relu(self.fc3(x2))
        x4 = self.relu(self.fc4(x3))
        x5 = self.relu(self.fc5(x4))  # bottleneck code
        x6 = self.relu(self.fc6(x5))
        x7 = self.relu(self.fc7(x6))
        x8 = self.relu(self.fc8(x7))
        x9 = self.relu(self.fc9(x8))
        # NOTE(review): the final ReLU clamps the reconstruction to >= 0 —
        # fine only if the input features are non-negative; confirm against
        # the data before training.
        x10 = self.relu(self.fc10(x9))
        return x1, x2, x3, x4, x5, x6, x7, x8, x9, x10


# ---- training -----------------------------------------------------------
# Mini-batch training. The original script pushed the ENTIRE dataset to the
# GPU and ran it through the network in one forward pass, which exhausts a
# GTX 1070's 8 GB. Streaming small batches with a DataLoader keeps only one
# batch (plus its activations) resident on the GPU at a time.
# Also modernized: the deprecated `Variable` wrapper is dropped (plain
# tensors carry autograd since PyTorch 0.4) and `loss.data[0]` — which
# raises on current PyTorch — is replaced by `loss.item()`.
from torch.utils.data import DataLoader, TensorDataset

model = Autoencoder().double()
if cuda:
    model = model.cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

batch_size = 64  # tune: largest value that still fits in GPU memory
t_X = torch.from_numpy(X)  # kept on the CPU; batches are moved as needed
loader = DataLoader(TensorDataset(t_X), batch_size=batch_size, shuffle=True)

print('Start training')
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for (batch,) in loader:  # TensorDataset yields 1-tuples
        if cuda:
            batch = batch.cuda()  # move only this batch to the GPU
        # ===================forward=====================
        output = model(batch)[-1]
        loss = criterion(output, batch)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # accumulate sum of per-sample losses for an epoch-level average
        epoch_loss += loss.item() * batch.size(0)
    # ===================log========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, epoch_loss / len(t_X)))