Edit: It looks like I've fixed it for now by manually finding and converting the affected columns:
df['date_Week'] = df['date_Week'].astype(float)
This seems to be a leaky abstraction between Fast.ai and Pandas in how the week conversion is done. The conversion back to float is described in the Pandas documentation on the experimental nullable integer type: https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#integer-na
Original post:
I’m essentially following Chapter 9 (tabular data), but using my own data, to get a baseline from which to start working with Fast.ai.
I’m running into issues with missing dates.
According to https://docs.fast.ai/tabular.core.html#add_datepart, None dates should be handled, but for me the week part seems to be wrong. Is there an easy workaround for this, or am I missing something?
The data is loaded into TabularPandas pretty much exactly as given in the book.
# Earlier defining my data
make_date(df, 'date')
df = add_datepart(df, 'date')
# Later loading it.
splits = RandomSplitter()(range_of(sd_df))
procs = [Categorify, FillMissing]
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
The weird week part.
df['date_Week'].unique()
<IntegerArray>
[<NA>, 13, 17, 5, 22, 15, 31, 44, 47, 11, 27, 52, 49,
40, 23, 43, 26, 48, 12, 37, 36, 1, 30, 25, 42, 45,
28, 46, 33, 41, 7, 29, 39, 21, 6, 24, 16, 3, 50,
19, 8, 2, 9, 38, 51, 10, 35, 14, 4, 32, 34, 18,
20, 53]
Length: 54, dtype: UInt32
The day part, on the other hand, seems to be working correctly, giving:
df['date_day'].unique()
array([nan, 30., 21., 6., 1., 14., 31., 28., 15., 3., 5., 12., 22., 23., 29., 19., 11., 27., 7., 20., 25., 16., 18., 8., 17., 24., 10., 13., 4., 2., 26., 9.])
The entire error is as follows:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-27-d26047aee92e> in <module>
8 procs = [Categorify, FillMissing]
9 cont, cat = cont_cat_split(df, 1, dep_var=dep_var)
---> 10 to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
11
12 save_pickle('./tabular_data.pkl',to)
~\Anaconda3\envs\fastai\lib\site-packages\fastai\tabular\core.py in __init__(self, df, procs, cat_names, cont_names, y_names, y_block, splits, do_setup, device, inplace, reduce_memory)
163 self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
164 self.split = len(df) if splits is None else len(splits[0])
--> 165 if do_setup: self.setup()
166
167 def new(self, df):
~\Anaconda3\envs\fastai\lib\site-packages\fastai\tabular\core.py in setup(self)
174 def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
175 def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
--> 176 def setup(self): self.procs.setup(self)
177 def process(self): self.procs(self)
178 def loc(self): return self.items.loc
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in setup(self, items, train_setup)
190 tfms = self.fs[:]
191 self.fs.clear()
--> 192 for t in tfms: self.add(t,items, train_setup)
193
194 def add(self,t, items=None, train_setup=False):
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in add(self, t, items, train_setup)
193
194 def add(self,t, items=None, train_setup=False):
--> 195 t.setup(items, train_setup)
196 self.fs.append(t)
197
~\Anaconda3\envs\fastai\lib\site-packages\fastai\tabular\core.py in setup(self, items, train_setup)
219 "Base class to write a non-lazy tabular processor for dataframes"
220 def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
--> 221 super().setup(getattr(items,'train',items), train_setup=False)
222 # Procs are called as soon as data is available
223 return self(items.items if isinstance(items,Datasets) else items)
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in setup(self, items, train_setup)
77 def setup(self, items=None, train_setup=False):
78 train_setup = train_setup if self.train_setup is None else self.train_setup
---> 79 return self.setups(getattr(items, 'train', items) if train_setup else items)
80
81 def _call(self, fn, x, split_idx=None, **kwargs):
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\dispatch.py in __call__(self, *args, **kwargs)
115 elif self.inst is not None: f = MethodType(f, self.inst)
116 elif self.owner is not None: f = MethodType(f, self.owner)
--> 117 return f(*args, **kwargs)
118
119 def __get__(self, inst, owner):
~\Anaconda3\envs\fastai\lib\site-packages\fastai\tabular\core.py in setups(self, to)
238 order = 1
239 def setups(self, to):
--> 240 store_attr(classes={n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names})
241
242 def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
~\Anaconda3\envs\fastai\lib\site-packages\fastai\tabular\core.py in <dictcomp>(.0)
238 order = 1
239 def setups(self, to):
--> 240 store_attr(classes={n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names})
241
242 def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
~\Anaconda3\envs\fastai\lib\site-packages\fastai\data\transforms.py in __init__(self, col, sort, add_na, strict)
212 if not hasattr(col,'unique'): col = L(col, use_list=True)
213 # `o==o` is the generalized definition of non-NaN used by Pandas
--> 214 items = L(o for o in col.unique() if o==o)
215 if sort: items = items.sorted()
216 self.items = '#na#' + items if add_na else items
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\foundation.py in __call__(cls, x, *args, **kwargs)
120 def __call__(cls, x=None, *args, **kwargs):
121 if not args and not kwargs and x is not None and isinstance(x,cls): return x
--> 122 return super().__call__(x, *args, **kwargs)
123
124 # Cell
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\foundation.py in __init__(self, items, use_list, match, *rest)
128 def __init__(self, items=None, *rest, use_list=False, match=None):
129 if (use_list is not None) or not is_array(items):
--> 130 items = listify(items, *rest, use_list=use_list, match=match)
131 super().__init__(items)
132
~\Anaconda3\envs\fastai\lib\site-packages\fastcore\basics.py in listify(o, use_list, match, *rest)
54 elif isinstance(o, list): res = o
55 elif isinstance(o, str) or is_array(o): res = [o]
---> 56 elif is_iter(o): res = list(o)
57 else: res = [o]
58 if match is not None:
~\Anaconda3\envs\fastai\lib\site-packages\fastai\data\transforms.py in <genexpr>(.0)
212 if not hasattr(col,'unique'): col = L(col, use_list=True)
213 # `o==o` is the generalized definition of non-NaN used by Pandas
--> 214 items = L(o for o in col.unique() if o==o)
215 if sort: items = items.sorted()
216 self.items = '#na#' + items if add_na else items
pandas\_libs\missing.pyx in pandas._libs.missing.NAType.__bool__()
TypeError: boolean value of NA is ambiguous