Proc_df throws "No matching signature found"

jkdk · June 10, 2018, 7:54pm

Hi all,

I have a dataframe with 306487 rows by 658 features. The last column y is the target variable to predict.

You may download this file, unpack it and then run the following to produce my error:

train_df = pd.read_pickle('data/train_df_fastai')
df, y, nas = proc_df(train_df, 'y')

The proc_df() produces this error that I am unable to resolve.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in f(values, axis, skipna, **kwds)
    127                 else:
--> 128                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    129             except Exception:

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in nanmedian(values, axis, skipna)
    407     # otherwise return a scalar value
--> 408     return _wrap_results(get_median(values) if notempty else np.nan, dtype)
    409 

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in get_median(x)
    378             return np.nan
--> 379         return algos.median(_values_from_object(x[mask]))
    380 

pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()

TypeError: No matching signature found

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-25-fd20db6cd3b8> in <module>()
      1 train_df.to_pickle('data/train_df_fastai')
----> 2 df, y, nas = proc_df(train_df, 'y')

/usr/local/lib/python3.6/site-packages/fastai/structured.py in proc_df(df, y_fld, skip_flds, ignore_flds, do_scale, na_dict, preproc_fn, max_n_cat, subset, mapper)
    434 
    435     if na_dict is None: na_dict = {}
--> 436     for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    437     if do_scale: mapper = scale_vars(df, mapper)
    438     for n,c in df.items(): numericalize(df, c, n, max_n_cat)

/usr/local/lib/python3.6/site-packages/fastai/structured.py in fix_missing(df, col, name, na_dict)
    267         if pd.isnull(col).sum() or (name in na_dict):
    268             df[name+'_na'] = pd.isnull(col)
--> 269             filler = na_dict[name] if name in na_dict else col.median()
    270             df[name] = col.fillna(filler)
    271             na_dict[name] = filler

/usr/local/lib/python3.6/site-packages/pandas/core/generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
   7313                                       skipna=skipna)
   7314         return self._reduce(f, name, axis=axis, skipna=skipna,
-> 7315                             numeric_only=numeric_only)
   7316 
   7317     return set_function_name(stat_func, name, cls)

/usr/local/lib/python3.6/site-packages/pandas/core/series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   2575                                           'numeric_only.'.format(name))
   2576             with np.errstate(all='ignore'):
-> 2577                 return op(delegate, skipna=skipna, **kwds)
   2578 
   2579         return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in _f(*args, **kwargs)
     75             try:
     76                 with np.errstate(invalid='ignore'):
---> 77                     return f(*args, **kwargs)
     78             except ValueError as e:
     79                 # we want to transform an object array

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in f(values, axis, skipna, **kwds)
    129             except Exception:
    130                 try:
--> 131                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    132                 except ValueError as e:
    133                     # we want to transform an object array

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in nanmedian(values, axis, skipna)
    406 
    407     # otherwise return a scalar value
--> 408     return _wrap_results(get_median(values) if notempty else np.nan, dtype)
    409 
    410 

/usr/local/lib/python3.6/site-packages/pandas/core/nanops.py in get_median(x)
    377         if not skipna and not mask.all():
    378             return np.nan
--> 379         return algos.median(_values_from_object(x[mask]))
    380 
    381     if not is_float_dtype(values):

pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()

TypeError: No matching signature found

I am using this code to keep the dtypes tidy:

for col in df.columns:
    col_type = df[col].dtype
    
    if col_type != object:
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    else:
        df[col] = df[col].astype('category')

In my effort to try and solve the issue I have tried this with no luck:

for column in train_df:
    if str(train_df[column].dtype)[:3] == 'int':
        train_df[column] = train_df[column].astype(float)

I have also tried running train_cats() even though it should not be necessary in my case.

Anyone has ideas on how to solve this?