Custom model with custom databunch

Hi,

I’m working towards creating a complex custom databunch to work with a custom pytorch module. I decided to start slow with a simple dataset and a simple data and build up from there. I’m porting this tutorial (which is written in Keras) with this dataset. This is a binary classification problem.

I loaded the CSV file into a df and am following the steps in the docs to create my databunch. I used CategoryList since this is a binary classification problem. The df does not have a header row; it has 8 input features, with the last column as the dependent variable.

train_df = pd.read_csv(data_path/'train.csv', header=None)
train.head()
	0	1	2	3	4	5	6	7	8
0	7	124	70	33	215	25.5	0.161	37	0
1	3	173	78	39	185	33.8	0.970	31	1
2	11	138	76	0	0	33.2	0.420	35	0
3	7	196	90	0	0	39.8	0.451	41	1
4	9	112	82	24	0	28.2	1.282	50	1
db = (CategoryList.from_df(train_df, cols=list(range(8)))
      .random_split_by_pct(valid_pct=0.1, seed=42)
     )

No problems so far. Next, I want to specify the labels, which are in the last column (column number 8):

db = db.label_from_df(cols=8, label_cls=CategoryList)

and I get the following error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-34-747806db2164> in <module>
----> 1 db = db.label_from_df(cols=8, label_cls=CategoryList)

~/fastai/fastai/data_block.py in _inner(*args, **kwargs)
    431             self.valid = fv(*args, **kwargs)
    432             self.__class__ = LabelLists
--> 433             self.process()
    434             return self
    435         return _inner

~/fastai/fastai/data_block.py in process(self)
    481         "Process the inner datasets."
    482         xp,yp = self.get_processors()
--> 483         for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
    484         #progress_bar clear the outputs so in some case warnings issued during processing disappear.
    485         for ds in self.lists:

~/fastai/fastai/data_block.py in process(self, xp, yp, name)
    655                     p.warns = []
    656                 self.x,self.y = self.x[~filt],self.y[~filt]
--> 657         self.x.process(xp)
    658         return self
    659 

~/fastai/fastai/data_block.py in process(self, processor)
     70         if processor is not None: self.processor = processor
     71         self.processor = listify(self.processor)
---> 72         for p in self.processor: p.process(self)
     73         return self
     74 

~/fastai/fastai/data_block.py in process(self, ds)
    299 
    300     def process(self, ds):
--> 301         if self.classes is None: self.create_classes(self.generate_classes(ds.items))
    302         ds.classes = self.classes
    303         ds.c2i = self.c2i

~/fastai/fastai/data_block.py in generate_classes(self, items)
    290     def generate_classes(self, items):
    291         "Generate classes from `items` by taking the sorted unique values."
--> 292         return uniqueify(items)
    293 
    294     def process_one(self,item):

~/fastai/fastai/core.py in uniqueify(x)
     73 def uniqueify(x:Series)->List:
     74     "Return sorted unique values of `x`."
---> 75     res = list(OrderedDict.fromkeys(x).keys())
     76     res.sort()
     77     return res

TypeError: unhashable type: 'numpy.ndarray'

Thanks.

I tried something different. I extracted the dependent variable into a list so that I can use the label_from_list method. I also used split_by_idx for the random split and set up my dataframe accordingly. However, I still get the same TypeError:

y = train_df.iloc[:, 8].values
train_df.drop(columns=8, inplace=True)

def get_rdm_idx(arr, pct=0.2):
    """Return sorted, unique random positions covering a fraction of `arr`.

    Args:
        arr: Any sized object (e.g. a DataFrame or list); only `len(arr)` is used.
        pct: Fraction of positions to pick. The count is rounded up and
            capped at `len(arr)`.

    Returns:
        A sorted 1-D NumPy integer array of distinct positions in
        `range(len(arr))`.
    """
    n_items = len(arr)
    # Builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    # Capping keeps `pct >= 1` valid now that sampling is without replacement.
    n_pick = min(int(np.ceil(pct * n_items)), n_items)
    # replace=False: a duplicated index would silently shrink the split.
    return np.sort(np.random.choice(n_items, size=n_pick, replace=False))

db = (CategoryList.from_df(train_df, cols=list(range(len(train_df.columns))))
      .split_by_idx(get_rdm_idx(train_df))
     )
db = db.label_from_list(y)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-6-6fa0bbda87ba> in <module>
----> 1 db = db.label_from_list(y)

~/fastai/fastai/data_block.py in _inner(*args, **kwargs)
    431             self.valid = fv(*args, **kwargs)
    432             self.__class__ = LabelLists
--> 433             self.process()
    434             return self
    435         return _inner

~/fastai/fastai/data_block.py in process(self)
    481         "Process the inner datasets."
    482         xp,yp = self.get_processors()
--> 483         for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
    484         #progress_bar clear the outputs so in some case warnings issued during processing disappear.
    485         for ds in self.lists:

~/fastai/fastai/data_block.py in process(self, xp, yp, name)
    655                     p.warns = []
    656                 self.x,self.y = self.x[~filt],self.y[~filt]
--> 657         self.x.process(xp)
    658         return self
    659 

~/fastai/fastai/data_block.py in process(self, processor)
     70         if processor is not None: self.processor = processor
     71         self.processor = listify(self.processor)
---> 72         for p in self.processor: p.process(self)
     73         return self
     74 

~/fastai/fastai/data_block.py in process(self, ds)
    299 
    300     def process(self, ds):
--> 301         if self.classes is None: self.create_classes(self.generate_classes(ds.items))
    302         ds.classes = self.classes
    303         ds.c2i = self.c2i

~/fastai/fastai/data_block.py in generate_classes(self, items)
    290     def generate_classes(self, items):
    291         "Generate classes from `items` by taking the sorted unique values."
--> 292         return uniqueify(items)
    293 
    294     def process_one(self,item):

~/fastai/fastai/core.py in uniqueify(x)
     73 def uniqueify(x:Series)->List:
     74     "Return sorted unique values of `x`."
---> 75     res = list(OrderedDict.fromkeys(x).keys())
     76     res.sort()
     77     return res

TypeError: unhashable type: 'numpy.ndarray'

CategoryList is for a single column of categorical data (usually a target in classification). You should use something like TabularList for your inputs.