Why is it a bad idea to update LR with gradient descent?

@jeremy
What if we update the learning rate using backprop? Why is it a bad idea? Or is it?

Something like this:

import torch
import numpy as np
torch.manual_seed(42)
np.random.seed(42)

N = 64

alpha = 1.3
beta = np.array([[1.9], [1.5]])

x_data = np.random.randn(N, 2)
y_data = x_data.dot(beta) + alpha

x = torch.from_numpy(x_data).float()
y = torch.from_numpy(y_data).float()

w_beta = torch.randn((2, 1), requires_grad=True)
w_alpha = torch.randn(1, requires_grad=True)

learning_rate = torch.tensor(0.01, requires_grad=True)

torch.autograd.set_detect_anomaly(False)
for t in range(10):
    # Re-attach the leaves each iteration so a fresh graph is built.
    learning_rate.detach_().requires_grad_()
    w_alpha.detach_().requires_grad_()
    w_beta.detach_().requires_grad_()

    y_pred = x.mm(w_beta).add(w_alpha)
    loss = (y_pred - y).pow(2).sum()

    # create_graph=True keeps the gradients themselves differentiable,
    # so d(loss)/d(learning_rate) can flow through the update below.
    loss.backward(create_graph=True)
    print(loss.item())

    saved_w_beta_grad = w_beta.grad.detach().clone()
    saved_w_alpha_grad = w_alpha.grad.detach().clone()

    # One "virtual" SGD step, kept inside the graph so the new loss
    # becomes a function of learning_rate.
    w_beta2 = w_beta - w_beta.grad * learning_rate
    w_alpha2 = w_alpha - w_alpha.grad * learning_rate

    y_pred = x.mm(w_beta2).add(w_alpha2)
    loss = (y_pred - y).pow(2).sum()
    loss.backward()  # populates learning_rate.grad (the hypergradient)

    print("HERE", learning_rate, learning_rate.grad)
    with torch.no_grad():
        learning_rate -= 0.0000001 * learning_rate.grad
        w_beta = w_beta - saved_w_beta_grad * learning_rate
        w_alpha = w_alpha - saved_w_alpha_grad * learning_rate
    # detach_() does not clear .grad, so reset it here or the
    # hypergradient accumulates across iterations.
    learning_rate.grad = None
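
To spell out what the two backward passes compute: the first gives $\nabla L(\theta)$, and the virtual step $\theta' = \theta - \alpha \nabla L(\theta)$ keeps $\alpha$ in the graph, so the second backward yields

$$\frac{\partial L(\theta')}{\partial \alpha} = \nabla L(\theta') \cdot \frac{\partial \theta'}{\partial \alpha} = -\nabla L(\theta') \cdot \nabla L(\theta),$$

i.e. the hypergradient is just (minus) the dot product of the gradients before and after the step.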

There is a paper that does a similar thing (I believe the idea is known as hypergradient descent):

Besides, it needs a lot of memory (the backward with create_graph=True has to keep the whole graph alive), and I guess the meta learning rate (the fixed 0.0000001 used to update learning_rate) is really sensitive.
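
On the memory point: for a single unrolled step the second-order graph isn't strictly needed, because the formula above reduces the hypergradient to a dot product of consecutive gradients. A rough sketch of that variant, reusing x and y from above (the names lr, meta_lr and prev_grads are mine, not from any library):

# Sketch: hypergradient as a dot product of consecutive gradients,
# no create_graph needed.
w_beta = torch.randn((2, 1), requires_grad=True)
w_alpha = torch.randn(1, requires_grad=True)
lr = 0.01       # the learning rate is now a plain float
meta_lr = 1e-7  # step size for updating lr itself

prev_grads = None
for t in range(10):
    y_pred = x.mm(w_beta).add(w_alpha)
    loss = (y_pred - y).pow(2).sum()
    grads = torch.autograd.grad(loss, [w_beta, w_alpha])

    if prev_grads is not None:
        # dL/d(lr) = -grad_t . grad_{t-1}, so descending on lr means:
        h = sum((g * pg).sum().item() for g, pg in zip(grads, prev_grads))
        lr += meta_lr * h

    with torch.no_grad():
        w_beta -= lr * grads[0]
        w_alpha -= lr * grads[1]

    prev_grads = [g.detach().clone() for g in grads]
    print(t, loss.item(), lr)

As far as I can tell this is the same update the hypergradient-descent line of work uses, and it only has to keep one extra set of gradients around instead of the whole double-backward graph.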