I am testing the dog_breeds notebook and ran into an error where numpy couldn't average the stacked torch CUDA tensors during validation. I worked around it by calling .cpu() on each metric value as it is produced.

```
def validate(stepper, dl, metrics, epoch, seq_first=False, validate_skip=0):
    """Run one validation pass over `dl` and return [avg_loss, *avg_metrics].

    Parameters
    ----------
    stepper : object with `reset` and `evaluate` — wraps the model being validated.
    dl : iterable of batches `(x..., y)`.
    metrics : list of callables `f(preds, targets)` returning a scalar each.
    epoch : current epoch index.
    seq_first : passed through to `batch_sz` (sequence-first batch layout).
    validate_skip : skip validation entirely for epochs below this value,
        returning NaN placeholders so the result shape stays consistent.

    Returns
    -------
    list: `[loss, metric_1, ..., metric_k]`, each a batch-size-weighted average.
    """
    if epoch < validate_skip:
        # One NaN for the loss plus one per metric keeps the return shape stable.
        return [float('nan')] + [float('nan')] * len(metrics)
    batch_cnts, loss, res = [], [], []
    stepper.reset(False)
    with no_grad_context():
        t = tqdm(iter(dl), leave=False, total=len(dl), miniters=0, desc='Validation')
        for (*x, y) in t:
            y = VV(y)
            preds, l = stepper.evaluate(VV(x), y)
            batch_cnts.append(batch_sz(x, seq_first=seq_first))
            loss.append(to_np(l))
            # Metrics may return CUDA tensors, which numpy cannot stack or
            # average. Convert tensor-like results to host numpy (mirroring how
            # the loss goes through `to_np` above); leave plain Python/numpy
            # numbers untouched — a bare `.cpu()` would crash on metrics that
            # return ordinary floats.
            vals = [f(datafy(preds), datafy(y)) for f in metrics]
            res.append([to_np(v) if hasattr(v, 'cpu') else v for v in vals])
    return [np.average(loss, 0, weights=batch_cnts)] + list(np.average(np.stack(res), 0, weights=batch_cnts))
```

Not sure if that’s actually a good fix or just a hack, but if you don’t already know about it, hopefully this saves you some time…