Unexpected Char in Validation Set

gonnan · October 21, 2020, 5:43pm

Hello everybody!

I’ve been trying to apply the tabular_learner model to the Riiid competition on Kaggle, but I’m running into a weird issue: during validation my model seems to be running into a Char dtype where a Float is expected. I’ve been smashing my head against the wall trying to figure this out. I thought it was running into a ‘#na#’ because a validation category wasn’t in the training set. But I reduced the example to include only two validation items, both of which contain values that are seen in the training set.

dep_var = 'answered_correctly'

simple = X[['user_id', 'user_mean', 'answered_correctly']]
cont, cat = cont_cat_split(simple, 1, dep_var=dep_var)

cont, cat
# (['user_mean'], ['user_id'])

# Create a split for a really small validation set
split = int(0.99999 * len(simple))
train_index = simple.iloc[:split].index
test_index = simple.iloc[split:].index
splits = (list(train_index), list(test_index))

simple.loc[train_index].tail()
"""
user_id	user_mean	answered_correctly
4022163	0.716216	        1 
4022163	0.716216	        1
"""

# So the validation and training sets both contain the user_id 4022163
simple.loc[test_index]
"""
user_id	user_mean	answered_correctly
4022163	0.716216	        1
4022163	0.716216	        1
"""

procs = [Categorify, FillMissing, Normalize]

to = TabularPandas(simple, procs, cat, cont, y_names=dep_var, splits=splits) 
dls = to.dataloaders(2)

learn = tabular_learner(dls, t_range=(0,1), layers=[500, 250], n_out=1, loss_func=F.mse_loss)
learn.fit_one_cycle(5, 1e-2)

The stack trace I get for this is

epoch	train_loss	valid_loss	time
0	0.620800	00:00
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-192-24cb127eeafb> in <module>
----> 1 learn.fit_one_cycle(5, 1e-2)

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch = n_epoch
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    177         self.iter = i
    178         self._split(b)
--> 179         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    180 
    181     def _do_epoch_train(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    168         if not self.training or not len(self.yb): return
    169         self('before_backward')
--> 170         self._backward()
    171         self('after_backward')
    172         self._step()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _backward(self)
    150 
    151     def _step(self): self.opt.step()
--> 152     def _backward(self): self.loss.backward()
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):

~/.local/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    183                 products. Defaults to ``False``.
    184         """
--> 185         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    186 
    187     def register_hook(self, hook):

~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    123         retain_graph = create_graph
    124 
--> 125     Variable._execution_engine.run_backward(
    126         tensors, grad_tensors, retain_graph, create_graph,
    127         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: Found dtype Char but expected Float
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:183 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fda948cc1e2 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0x259 (0x7fdad07b7849 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x7fdad07bafeb in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x7fdad07bb65d in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::native::mse_loss_backward_out(at::Tensor&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x18a (0x7fdad06202ba in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #5: at::native::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x90 (0x7fdad061cce0 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x10fa2f9 (0x7fdad0a2f2f9 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xa9ac76 (0x7fdad03cfc76 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #8: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdad0adf949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #9: <unknown function> + 0x2e03469 (0x7fdad2738469 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0xa9ac76 (0x7fdad03cfc76 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #11: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdad0adf949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::generated::MseLossBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1af (0x7fdad26740cf in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x3375bb7 (0x7fdad2caabb7 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7fdad2ca6400 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7fdad2ca6fa1 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::execute_with_graph_task(std::shared_ptr<torch::autograd::GraphTask> const&, std::shared_ptr<torch::autograd::Node>) + 0x37c (0x7fdad2ca46bc in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::python::PythonEngine::execute_with_graph_task(std::shared_ptr<torch::autograd::GraphTask> const&, std::shared_ptr<torch::autograd::Node>) + 0x3c (0x7fdae043acac in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #18: torch::autograd::Engine::execute(std::vector<torch::autograd::Edge, std::allocator<torch::autograd::Edge> > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, bool, bool, std::vector<torch::autograd::Edge, std::allocator<torch::autograd::Edge> > const&) + 0x803 (0x7fdad2ca39f3 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::python::PythonEngine::execute(std::vector<torch::autograd::Edge, std::allocator<torch::autograd::Edge> > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, bool, bool, std::vector<torch::autograd::Edge, std::allocator<torch::autograd::Edge> > const&) + 0x4e (0x7fdae043aaae in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #20: THPEngine_run_backward(THPEngine*, _object*, _object*) + 0x984 (0x7fdae043b6c4 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #21: PyCFunction_Call + 0x56 (0x55a157eb3f76 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #22: _PyObject_MakeTpCall + 0x22f (0x55a157e7185f in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #23: _PyEval_EvalFrameDefault + 0x11d0 (0x55a157ef5b90 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #24: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #25: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #26: <unknown function> + 0x10077f (0x55a157e3477f in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #27: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #28: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #29: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #30: _PyFunction_Vectorcall + 0x10b (0x55a157ebf86b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #31: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #32: <unknown function> + 0x18bc0b (0x55a157ebfc0b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #33: <unknown function> + 0xfeb84 (0x55a157e32b84 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #34: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #35: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #36: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #37: _PyFunction_Vectorcall + 0x10b (0x55a157ebf86b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #38: <unknown function> + 0x18be79 (0x55a157ebfe79 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #39: PyVectorcall_Call + 0x71 (0x55a157e71041 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #40: _PyEval_EvalFrameDefault + 0x1fdb (0x55a157ef699b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #41: <unknown function> + 0x18bc0b (0x55a157ebfc0b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #42: <unknown function> + 0xfeb84 (0x55a157e32b84 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #43: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #44: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #45: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #46: _PyFunction_Vectorcall + 0x10b (0x55a157ebf86b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #47: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #48: <unknown function> + 0x18bc0b (0x55a157ebfc0b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #49: <unknown function> + 0xfeb84 (0x55a157e32b84 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #50: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #51: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #52: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #53: <unknown function> + 0x18bc0b (0x55a157ebfc0b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #54: <unknown function> + 0xfeb84 (0x55a157e32b84 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #55: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #56: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #57: <unknown function> + 0x10075e (0x55a157e3475e in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #58: _PyEval_EvalCodeWithName + 0x2d2 (0x55a157ebea92 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #59: _PyFunction_Vectorcall + 0x1e3 (0x55a157ebf943 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #60: PyVectorcall_Call + 0x71 (0x55a157e71041 in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #61: _PyEval_EvalFrameDefault + 0x1fdb (0x55a157ef699b in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #62: _PyEval_EvalCodeWithName + 0x7df (0x55a157ebef9f in /home/gannon/anaconda3/envs/fastai2/bin/python)
frame #63: <unknown function> + 0x18bd20 (0x55a157ebfd20 in /home/gannon/anaconda3/envs/fastai2/bin/python)

Can anyone offer any insights into this problem? I found one thread where someone was running into a similar problem, but the solution offered was that the validation set contained categorical values not present in the training set, so they were marked as ‘#na#’. But that doesn’t seem to be the case here?

Also, if anyone has issues with how I’m presenting this problem, I’m definitely open to improving on that too. I read through the debugging suggestions thread, but I feel like this is still pretty ugly and maybe not that helpful for anyone trying to help?

Thanks a lot

darek.kleczek · October 22, 2020, 4:44am

Looking at your stack trace, it seems like the model is failing on the training set rather than on validation (within the fit method). So the issue is in the training set rather than the validation set. Maybe that can help track it down, e.g. by reducing your training set to a single batch where you can manually inspect every example?

gonnan · October 22, 2020, 2:35pm

Ok interesting, thanks for the response!

I reduced my dataset down to 10 items of type int8 but I’m still getting the same error.

simple
# I imagine there's a better way to display DataFrames in markdown?
"""
user_id	user_mean	answered_correctly
0	115	     1	     1
1	115	     1	     1
2	115	     1	     1
3	115	     1	     1
4	115	     1	     1
5	115	     1	     1
6	115	     1	     1
7	115	     1	     1
8	115	     1	     1
9	115	     1	     1
"""

simple.dtypes
"""
user_id               int32
user_mean              int8
answered_correctly     int8
dtype: object
"""

procs = [Categorify, FillMissing, Normalize]

to = TabularPandas(simple, procs, cat, cont, y_names=dep_var, splits=splits) #, inplace=True)
dls = to.dataloaders(2)

dls.train.items.dtypes
"""
user_id               int8
user_mean             int8
answered_correctly    int8
dtype: object
"""
dls.train.items
"""
	user_id	user_mean	answered_correctly
0	1		1			1
1	1		1			1
2	1		1			1
3	1		1			1
4	1		1			1
5	1		1			1	
6	1		1			1
7	1		1			1
"""

dls.valid.items.dtypes
"""
user_id               int8
user_mean             int8
answered_correctly    int8
dtype: object
"""

dls.valid.items
"""
	user_id	user_mean	answered_correctly
8	1		1			1
9	1		1			1
"""

learn = tabular_learner(dls, t_range=(0,1), layers=[500, 250], n_out=1, loss_func=F.mse_loss)
learn.fit_one_cycle(1, 1e-2)

Which gets me the same stack trace:

epoch	train_loss	valid_loss	time
0	1.103178	00:00
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-86-3ea49add0339> in <module>
----> 1 learn.fit_one_cycle(1, 1e-2)

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch = n_epoch
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    177         self.iter = i
    178         self._split(b)
--> 179         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    180 
    181     def _do_epoch_train(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    168         if not self.training or not len(self.yb): return
    169         self('before_backward')
--> 170         self._backward()
    171         self('after_backward')
    172         self._step()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _backward(self)
    150 
    151     def _step(self): self.opt.step()
--> 152     def _backward(self): self.loss.backward()
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):

~/.local/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    183                 products. Defaults to ``False``.
    184         """
--> 185         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    186 
    187     def register_hook(self, hook):

~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    123         retain_graph = create_graph
    124 
--> 125     Variable._execution_engine.run_backward(
    126         tensors, grad_tensors, retain_graph, create_graph,
    127         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: Found dtype Char but expected Float
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:183 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fdcfef3c1e2 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0x259 (0x7fdd3ae27849 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x7fdd3ae2afeb in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x7fdd3ae2b65d in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::native::mse_loss_backward_out(at::Tensor&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x18a (0x7fdd3ac902ba in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0xf2b190 (0x7fdd002d0190 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #6: at::native::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x90 (0x7fdd3ac8cce0 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xf2b230 (0x7fdd002d0230 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0xf4d4b6 (0x7fdd002f24b6 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #9: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdd3b14f949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x2e03469 (0x7fdd3cda8469 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0xa9ac76 (0x7fdd3aa3fc76 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #12: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdd3b14f949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::generated::MseLossBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1af (0x7fdd3cce40cf in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x3375bb7 (0x7fdd3d31abb7 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7fdd3d316400 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7fdd3d316fa1 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fdd3d30f119 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fdd4aaaa86a in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0xc819d (0x7fdd7a0b819d in /home/gannon/anaconda3/envs/fastai2/lib/python3.8/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #20: <unknown function> + 0x9609 (0x7fdd7d01c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #21: clone + 0x43 (0x7fdd7cf43103 in /lib/x86_64-linux-gnu/libc.so.6)

gonnan · October 22, 2020, 3:02pm

So I’ve produced an even smaller example with the same results:

minimal = pd.DataFrame([1, 1, 1, 1], columns=['first'])
minimal['second'] = 2

cont, cat = cont_cat_split(minimal, 1, dep_var='second')

minimal_splits = ([0, 1], [2, 3])

procs = [Categorify, FillMissing, Normalize]

to = TabularPandas(minimal, procs, cat, cont, y_names='second', splits=minimal_splits) #, inplace=True)
dls = to.dataloaders(2)

learn = tabular_learner(dls, t_range=(0,3), layers=[500, 250], n_out=1, loss_func=F.mse_loss)
learn.fit_one_cycle(1, 2e-2)

Which gets the same stack trace:

epoch	train_loss	valid_loss	time
0	4.013793	00:00
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-112-07c3fbbfd15e> in <module>
----> 1 learn.fit_one_cycle(1, 2e-2)

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~/anaconda3/envs/fastai2/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

~/.local/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch = n_epoch
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    177         self.iter = i
    178         self._split(b)
--> 179         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    180 
    181     def _do_epoch_train(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    168         if not self.training or not len(self.yb): return
    169         self('before_backward')
--> 170         self._backward()
    171         self('after_backward')
    172         self._step()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _backward(self)
    150 
    151     def _step(self): self.opt.step()
--> 152     def _backward(self): self.loss.backward()
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):

~/.local/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    183                 products. Defaults to ``False``.
    184         """
--> 185         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    186 
    187     def register_hook(self, hook):

~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    123         retain_graph = create_graph
    124 
--> 125     Variable._execution_engine.run_backward(
    126         tensors, grad_tensors, retain_graph, create_graph,
    127         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: Found dtype Char but expected Float
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:183 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fdcfef3c1e2 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0x259 (0x7fdd3ae27849 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x7fdd3ae2afeb in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x7fdd3ae2b65d in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::native::mse_loss_backward_out(at::Tensor&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x18a (0x7fdd3ac902ba in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0xf2b190 (0x7fdd002d0190 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #6: at::native::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x90 (0x7fdd3ac8cce0 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xf2b230 (0x7fdd002d0230 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0xf4d4b6 (0x7fdd002f24b6 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #9: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdd3b14f949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x2e03469 (0x7fdd3cda8469 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0xa9ac76 (0x7fdd3aa3fc76 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #12: at::mse_loss_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, long) + 0x119 (0x7fdd3b14f949 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::generated::MseLossBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1af (0x7fdd3cce40cf in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x3375bb7 (0x7fdd3d31abb7 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7fdd3d316400 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7fdd3d316fa1 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fdd3d30f119 in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fdd4aaaa86a in /home/gannon/.local/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0xc819d (0x7fdd7a0b819d in /home/gannon/anaconda3/envs/fastai2/lib/python3.8/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #20: <unknown function> + 0x9609 (0x7fdd7d01c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #21: clone + 0x43 (0x7fdd7cf43103 in /lib/x86_64-linux-gnu/libc.so.6)

darek.kleczek · October 22, 2020, 4:56pm

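If your loss function is mse_loss, then I think the target variable needs to be a float (regression). “Char” is PyTorch’s name for the int8 dtype, so your target is probably being passed to the loss as an 8-bit integer. Or is this a classification task? In that case you might need a different loss function, like cross entropy. Would that help?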

gonnan · October 22, 2020, 10:51pm

Thank you, you wonderful person! It is supposed to be a classification task, but I figured I should be able to do regression on it regardless, and I was so thrown by the char error. I feel stupid for not having changed my loss function initially, but I’m grateful you pointed it out. Thanks a lot!