Can you please look at the following code and let me know why my loss doesn’t improve?
# 28X28 weights
w = tensor([random.random()-0.5 for i in range(28*28)]).requires_grad_()
b= tensor(random.random()-0.5).requires_grad_()
def sigmoid(x):
return 1/(1+torch.exp(-x))
def predict(x):
return sigmoid(x@w.T+b)
def error(pred,y):
return torch.where(y==1,1-pred,pred)
dl = DataLoader(dset,256,shuffle=True)
lr=1
print ('starting loss:'+str(error(predict(train_X),train_y).mean()))
for x,y in dl:
error(predict(x),y).mean().backward()
a= w.grad
print('change in grad:'+str(a.sum())+' change in w:'+str(w.sum()))
w.data -= lr*a
b.data -= lr*b.grad.data
w.grad.zero_()
b.grad.zero_()
print ('final loss'+str(error(predict(train_X),train_y).mean()))
Here’s the result of running the last loop. as you can see the weights do change but the overall loss doesn’t.
starting loss:tensor(0.4955, grad_fn=)
change in grad:tensor(-0.0512) change in w:tensor(4.7489, grad_fn=)
change in grad:tensor(-0.1743) change in w:tensor(4.8002, grad_fn=)
change in grad:tensor(-0.1798) change in w:tensor(4.9744, grad_fn=)
change in grad:tensor(0.0791) change in w:tensor(5.1542, grad_fn=)
change in grad:tensor(-0.0504) change in w:tensor(5.0751, grad_fn=)
change in grad:tensor(0.0676) change in w:tensor(5.1255, grad_fn=)
change in grad:tensor(-0.0116) change in w:tensor(5.0579, grad_fn=)
change in grad:tensor(-0.1402) change in w:tensor(5.0694, grad_fn=)
change in grad:tensor(0.0881) change in w:tensor(5.2097, grad_fn=)
change in grad:tensor(0.0187) change in w:tensor(5.1215, grad_fn=)
change in grad:tensor(-0.1067) change in w:tensor(5.1029, grad_fn=)
change in grad:tensor(-0.0685) change in w:tensor(5.2095, grad_fn=)
change in grad:tensor(0.0827) change in w:tensor(5.2780, grad_fn=)
change in grad:tensor(0.) change in w:tensor(5.1954, grad_fn=)
change in grad:tensor(-0.0134) change in w:tensor(5.1954, grad_fn=)
change in grad:tensor(-0.0310) change in w:tensor(5.2087, grad_fn=)
change in grad:tensor(0.0160) change in w:tensor(5.2397, grad_fn=)
change in grad:tensor(0.1495) change in w:tensor(5.2238, grad_fn=)
change in grad:tensor(0.0106) change in w:tensor(5.0743, grad_fn=)
change in grad:tensor(0.0690) change in w:tensor(5.0637, grad_fn=)
change in grad:tensor(0.0298) change in w:tensor(4.9947, grad_fn=)
change in grad:tensor(-0.1751) change in w:tensor(4.9649, grad_fn=)
change in grad:tensor(-0.0475) change in w:tensor(5.1400, grad_fn=)
change in grad:tensor(0.0978) change in w:tensor(5.1875, grad_fn=)
change in grad:tensor(-0.0923) change in w:tensor(5.0897, grad_fn=)
change in grad:tensor(0.1903) change in w:tensor(5.1820, grad_fn=)
change in grad:tensor(0.0948) change in w:tensor(4.9917, grad_fn=)
change in grad:tensor(-0.1095) change in w:tensor(4.8969, grad_fn=)
change in grad:tensor(-0.0346) change in w:tensor(5.0064, grad_fn=)
change in grad:tensor(-0.0943) change in w:tensor(5.0410, grad_fn=)
change in grad:tensor(-0.0962) change in w:tensor(5.1354, grad_fn=)
change in grad:tensor(0.0682) change in w:tensor(5.2316, grad_fn=)
change in grad:tensor(-0.0953) change in w:tensor(5.1634, grad_fn=)
change in grad:tensor(0.0631) change in w:tensor(5.2587, grad_fn=)
change in grad:tensor(0.) change in w:tensor(5.1956, grad_fn=)
change in grad:tensor(0.0603) change in w:tensor(5.1956, grad_fn=)
change in grad:tensor(0.0427) change in w:tensor(5.1353, grad_fn=)
change in grad:tensor(0.0474) change in w:tensor(5.0925, grad_fn=)
change in grad:tensor(0.0746) change in w:tensor(5.0452, grad_fn=)
change in grad:tensor(-0.1346) change in w:tensor(4.9706, grad_fn=)
change in grad:tensor(-0.2334) change in w:tensor(5.1052, grad_fn=)
change in grad:tensor(-0.1048) change in w:tensor(5.3386, grad_fn=)
change in grad:tensor(0.) change in w:tensor(5.4434, grad_fn=)
change in grad:tensor(0.) change in w:tensor(5.4434, grad_fn=)
change in grad:tensor(0.0214) change in w:tensor(5.4434, grad_fn=)
change in grad:tensor(0.1048) change in w:tensor(5.4219, grad_fn=)
change in grad:tensor(-0.1088) change in w:tensor(5.3172, grad_fn=)
change in grad:tensor(-0.0883) change in w:tensor(5.4260, grad_fn=)
change in grad:tensor(-0.1138) change in w:tensor(5.5143, grad_fn=)
final losstensor(0.4955, grad_fn=)