@jeremy
What if we update the learning rate using backprop? Why is it a bad idea? Or is it?
Something like this:
import torch
import numpy as np

torch.manual_seed(42)
np.random.seed(42)

# toy linear-regression data: y = x @ beta + alpha
N = 64
alpha = 1.3
beta = np.array([[1.9], [1.5]])
x_data = np.random.randn(N, 2)
y_data = x_data.dot(beta) + alpha

x = torch.from_numpy(x_data).float()
y = torch.from_numpy(y_data).float()

w_beta = torch.randn((2, 1), requires_grad=True)
w_alpha = torch.randn(1, requires_grad=True)
learning_rate = torch.tensor(0.01, requires_grad=True)

torch.autograd.set_detect_anomaly(False)

for t in range(10):
    # re-attach the leaves and clear any gradients left over from the previous iteration
    learning_rate.detach_().requires_grad_()
    w_alpha.detach_().requires_grad_()
    w_beta.detach_().requires_grad_()
    learning_rate.grad = None
    w_alpha.grad = None
    w_beta.grad = None

    # first pass: ordinary loss and gradients w.r.t. the weights
    y_pred = x.mm(w_beta).add(w_alpha)
    loss = (y_pred - y).pow(2).sum()
    loss.backward(retain_graph=True, create_graph=True)
    print(loss.item())

    saved_w_beta_grad = w_beta.grad.detach().clone()
    saved_w_alpha_grad = w_alpha.grad.detach().clone()

    # second pass: a "lookahead" SGD step that keeps learning_rate in the graph,
    # so backprop through the new loss fills in learning_rate.grad
    w_beta2 = w_beta - w_beta.grad * learning_rate
    w_alpha2 = w_alpha - w_alpha.grad * learning_rate
    y_pred = x.mm(w_beta2).add(w_alpha2)
    loss = (y_pred - y).pow(2).sum()
    loss.backward(retain_graph=True, create_graph=True)
    print("HERE", learning_rate, learning_rate.grad)

    with torch.no_grad():
        # update the learning rate with its own tiny meta learning rate,
        # then update the weights using the saved first-pass gradients
        learning_rate -= 0.0000001 * learning_rate.grad
        w_beta = w_beta - saved_w_beta_grad * learning_rate
        w_alpha = w_alpha - saved_w_alpha_grad * learning_rate
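For what it's worth, here is a shorter, self-contained sketch of the same idea (my own variable names and seed, not part of the snippet above) written with torch.autograd.grad instead of .backward(). It also checks, via the chain rule, that the gradient that lands in the learning rate is just minus the dot product between the first-pass gradient and the gradient of the loss at the lookahead weights:

import torch

torch.manual_seed(0)
x = torch.randn(64, 2)
y = x @ torch.tensor([[1.9], [1.5]]) + 1.3

w = torch.randn(2, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = torch.tensor(0.01, requires_grad=True)

# first pass: plain gradients of the loss w.r.t. the weights
loss = ((x @ w + b - y) ** 2).sum()
g_w, g_b = torch.autograd.grad(loss, (w, b))

# lookahead step that keeps lr in the graph, then differentiate the new loss w.r.t. lr
loss2 = ((x @ (w - lr * g_w) + (b - lr * g_b) - y) ** 2).sum()
(lr_grad,) = torch.autograd.grad(loss2, (lr,))

# chain-rule check: d loss2 / d lr = -(g_w . grad_w loss2) - (g_b . grad_b loss2),
# with the second gradients taken at the lookahead point
w2 = (w - lr * g_w).detach().requires_grad_()
b2 = (b - lr * g_b).detach().requires_grad_()
loss2_check = ((x @ w2 + b2 - y) ** 2).sum()
g2_w, g2_b = torch.autograd.grad(loss2_check, (w2, b2))
manual = -(g_w * g2_w).sum() - (g_b * g2_b).sum()
print(lr_grad.item(), manual.item())  # the two numbers should agree up to float32 rounding

If I'm reading that right, the learning rate grows while consecutive gradients point in roughly the same direction and shrinks once they start disagreeing, which is basically the hypergradient-descent idea.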