Hi all,
I have a dataframe with 306487 rows by 658 features. The last column y
is the target variable to predict.
You may download this file, unpack it and then run the following to produce my error:
train_df = pd.read_pickle('data/train_df_fastai')
df, y, nas = proc_df(train_df, 'y')
The proc_df()
produces this error that I am unable to resolve.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in f(values, axis, skipna, **kwds)
127 else:
--> 128 result = alt(values, axis=axis, skipna=skipna, **kwds)
129 except Exception:
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in nanmedian(values, axis, skipna)
407 # otherwise return a scalar value
--> 408 return _wrap_results(get_median(values) if notempty else np.nan, dtype)
409
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in get_median(x)
378 return np.nan
--> 379 return algos.median(_values_from_object(x[mask]))
380
pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()
TypeError: No matching signature found
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-25-fd20db6cd3b8> in <module>()
1 train_df.to_pickle('data/train_df_fastai')
----> 2 df, y, nas = proc_df(train_df, 'y')
/usr/local/lib/python3.6/site-packages/fastai/structured.py in proc_df(df, y_fld, skip_flds, ignore_flds, do_scale, na_dict, preproc_fn, max_n_cat, subset, mapper)
434
435 if na_dict is None: na_dict = {}
--> 436 for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
437 if do_scale: mapper = scale_vars(df, mapper)
438 for n,c in df.items(): numericalize(df, c, n, max_n_cat)
/usr/local/lib/python3.6/site-packages/fastai/structured.py in fix_missing(df, col, name, na_dict)
267 if pd.isnull(col).sum() or (name in na_dict):
268 df[name+'_na'] = pd.isnull(col)
--> 269 filler = na_dict[name] if name in na_dict else col.median()
270 df[name] = col.fillna(filler)
271 na_dict[name] = filler
/usr/local/lib/python3.6/site-packages/pandas/core/generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
7313 skipna=skipna)
7314 return self._reduce(f, name, axis=axis, skipna=skipna,
-> 7315 numeric_only=numeric_only)
7316
7317 return set_function_name(stat_func, name, cls)
/usr/local/lib/python3.6/site-packages/pandas/core/series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
2575 'numeric_only.'.format(name))
2576 with np.errstate(all='ignore'):
-> 2577 return op(delegate, skipna=skipna, **kwds)
2578
2579 return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in _f(*args, **kwargs)
75 try:
76 with np.errstate(invalid='ignore'):
---> 77 return f(*args, **kwargs)
78 except ValueError as e:
79 # we want to transform an object array
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in f(values, axis, skipna, **kwds)
129 except Exception:
130 try:
--> 131 result = alt(values, axis=axis, skipna=skipna, **kwds)
132 except ValueError as e:
133 # we want to transform an object array
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in nanmedian(values, axis, skipna)
406
407 # otherwise return a scalar value
--> 408 return _wrap_results(get_median(values) if notempty else np.nan, dtype)
409
410
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in get_median(x)
377 if not skipna and not mask.all():
378 return np.nan
--> 379 return algos.median(_values_from_object(x[mask]))
380
381 if not is_float_dtype(values):
pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()
TypeError: No matching signature found
I am using this code to keep the dtypes tidy:
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
In my effort to try and solve the issue I have tried this with no luck:
for column in train_df:
if str(train_df[column].dtype)[:3] == 'int':
train_df[column] = train_df[column].astype(float)
I have also tried running train_cats()
even though it should not be necessary in my case.
Anyone has ideas on how to solve this?