AssertionError: nan values in `P_2` but not in setup training set

mcc_117 · June 5, 2022, 4:15am

I am trying to apply what I learned in the tabular data lesson to the American Express - Default Prediction Kaggle competition.

to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits,reduce_memory=True)

returns this error

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
/tmp/ipykernel_222/4108166279.py in <module>
----> 1 to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits,reduce_memory=True)

/opt/conda/lib/python3.7/site-packages/fastai/tabular/core.py in __init__(self, df, procs, cat_names, cont_names, y_names, y_block, splits, do_setup, device, inplace, reduce_memory)
    164         self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
    165         self.split = len(df) if splits is None else len(splits[0])
--> 166         if do_setup: self.setup()
    167 
    168     def new(self, df, inplace=False):

/opt/conda/lib/python3.7/site-packages/fastai/tabular/core.py in setup(self)
    175     def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    176     def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
--> 177     def setup(self): self.procs.setup(self)
    178     def process(self): self.procs(self)
    179     def loc(self): return self.items.loc

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in setup(self, items, train_setup)
    190         tfms = self.fs[:]
    191         self.fs.clear()
--> 192         for t in tfms: self.add(t,items, train_setup)
    193 
    194     def add(self,ts, items=None, train_setup=False):

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in add(self, ts, items, train_setup)
    194     def add(self,ts, items=None, train_setup=False):
    195         if not is_listy(ts): ts=[ts]
--> 196         for t in ts: t.setup(items, train_setup)
    197         self.fs+=ts
    198         self.fs = self.fs.sorted(key='order')

/opt/conda/lib/python3.7/site-packages/fastai/tabular/core.py in setup(self, items, train_setup)
    222         super().setup(getattr(items,'train',items), train_setup=False)
    223         # Procs are called as soon as data is available
--> 224         return self(items.items if isinstance(items,Datasets) else items)
    225 
    226     @property

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     71     @property
     72     def name(self): return getattr(self, '_name', _get_name(self))
---> 73     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     74     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     75     def __repr__(self): return f'{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}'

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     97     "A `Transform` that modifies in-place and just returns whatever it's passed"
     98     def _call(self, fn, x, split_idx=None, **kwargs):
---> 99         super()._call(fn,x,split_idx,**kwargs)
    100         return x
    101 

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     81     def _call(self, fn, x, split_idx=None, **kwargs):
     82         if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 83         return self._do_call(getattr(self, fn), x, **kwargs)
     84 
     85     def _do_call(self, f, x, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     87             if f is None: return x
     88             ret = f.returns(x) if hasattr(f,'returns') else None
---> 89             return retain_type(f(x, **kwargs), x, ret)
     90         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     91         return retain_type(res, x)

/opt/conda/lib/python3.7/site-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
    121         elif self.inst is not None: f = MethodType(f, self.inst)
    122         elif self.owner is not None: f = MethodType(f, self.owner)
--> 123         return f(*args, **kwargs)
    124 
    125     def __get__(self, inst, owner):

/opt/conda/lib/python3.7/site-packages/fastai/tabular/core.py in encodes(self, to)
    306         missing = pd.isnull(to.conts)
    307         for n in missing.any()[missing.any()].keys():
--> 308             assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
    309         for n in self.na_dict.keys():
    310             to[n].fillna(self.na_dict[n], inplace=True)

AssertionError: nan values in `P_2` but not in setup training set

I believe I have accounted for NaN values here:

# Making use of the GPU library. This only works for integer only features at present.
def read_file_int(path='', usecols=None):
    """Load a feather file with cuDF and keep the last row of every customer.

    Args:
        path: Location of the feather file passed to ``cudf.read_feather``.
        usecols: Optional list of column names to load; ``None`` loads
            every column.

    Returns:
        A frame with a freshly reset index that holds, for each
        ``customer_ID``, the last of its rows in file order. Missing values
        are replaced by the module-level ``NAN_VALUE`` and an ``n_missing``
        column records how many values each row was missing beforehand.
    """
    # Only forward `columns` when the caller asked for a subset.
    if usecols is None:
        frame = cudf.read_feather(path)
    else:
        frame = cudf.read_feather(path, columns=usecols)

    # Disabled: shrink customer_ID to an int64 built from its last 16 hex chars.
    # frame['customer_ID'] = frame['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    frame.S_2 = cudf.to_datetime(frame.S_2)

    # Count the missing entries per row before they are overwritten below;
    # the identifier and the label do not take part in the count.
    excluded = ['customer_ID', 'target']
    feature_cols = [col for col in frame.columns.values if col not in excluded]
    frame['n_missing'] = frame[feature_cols].isna().sum(axis=1)

    frame = frame.fillna(NAN_VALUE)

    # One row per customer until the time-series work begins.
    latest = frame.groupby(['customer_ID']).tail(1).reset_index(drop=True)
    print('shape of data:', latest.shape)
    return latest

# To ensure that the categorical features are imported only using CPU
def read_file_cpu(path='', usecols=None):
    """Load a feather file with pandas and keep the last row of every customer.

    Missing values are NOT filled by this version: the row-wise missing count
    and the NaN fill are both commented out below.

    Args:
        path: Location of the feather file passed to ``pd.read_feather``.
        usecols: Optional list of column names to load; ``None`` loads
            every column.

    Returns:
        A DataFrame with a freshly reset index that holds, for each
        ``customer_ID``, the last of its rows in file order.
    """
    # Only forward `columns` when the caller asked for a subset.
    read_kwargs = {} if usecols is None else {'columns': usecols}
    raw = pd.read_feather(path, **read_kwargs)

    # Disabled: shrink customer_ID to an int64 built from its last 16 hex chars.
    # raw['customer_ID'] = raw['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    raw.S_2 = pd.to_datetime(raw.S_2)

    # Disabled: per-row count of missing values.
    #features = [x for x in df.columns.values if x not in ['customer_ID', 'target']]
    #df['n_missing'] = df[features].isna().sum(axis=1)

    # Disabled: fill NaN in the numeric columns.
    #features_num = [x for x in df._get_numeric_data().columns.values if x not in ['customer_ID', 'target']]
    #df = df[features_num].fillna(NAN_VALUE)

    # One row per customer until the time-series work begins.
    per_customer = raw.groupby(['customer_ID'])
    df_out = per_customer.tail(1).reset_index(drop=True)
    print('shape of data:', df_out.shape)
    return df_out

# Locations of the train and test feather files.
TRAIN_PATH = '../input/amexfeather/train_data.ftr'
TEST_PATH = '../input/amexfeather/test_data.ftr'

# Load each file through read_file_cpu, announcing the step first.
print('Reading train data...')
train_df = read_file_cpu(path=TRAIN_PATH)

print('Reading test data...')
test_df = read_file_cpu(path=TEST_PATH)

ElisonSherton · June 5, 2022, 6:47am

Hi @mcc_117

I looked at your code for filling NA values. One question: is NAN_VALUE some kind of global variable? If so, what is its value?

Also, could you just print out the summary of nan values once you read the df just to be sure?

# Locations of the train and test feather files.
TRAIN_PATH = '../input/amexfeather/train_data.ftr'
TEST_PATH = '../input/amexfeather/test_data.ftr'

print('Reading train data...')
train_df = read_file_cpu(path=TRAIN_PATH)
# Per-column count of the missing values still present after loading.
print(f"Train NA Values\n{train_df.isnull().sum()}")

print('Reading test data...')
test_df = read_file_cpu(path=TEST_PATH)
# Same per-column summary for the test frame.
print(f"Test NA Values\n{test_df.isnull().sum()}")

Also, when you create the TabularPandas object, I think you would want to pass train_df, going by the variable names you have chosen when loading the data:

to = TabularPandas(train_df, procs, cat, cont, y_names=dep_var, splits=splits,reduce_memory=True)

Thanks,
Vinayak.

mcc_117 · June 5, 2022, 7:08am

NAN_VALUE = -99

I am running this line before the TabularPandas code:
df = add_datepart(train_df, 'S_2')

Here is my notebook.
https://www.kaggle.com/code/mcc117/american-express-default-prediction/edit

ElisonSherton · June 5, 2022, 8:35am

Hey @mcc_117

Thanks for running this piece of code and sharing the output,

We can see that there are still some missing values in columns P_2, D_143, D_145 etc. I think you’ve accidentally commented out the code which fills in the NA values in read_file_cpu function which might be causing this issue.

# To ensure that the categorical features are imported only using CPU
def read_file_cpu(path='', usecols=None):
    """Read a feather file on the CPU and return one row per customer.

    As written, no missing values are filled: the missing-count and NaN-fill
    steps are commented out below.

    Args:
        path: Location of the feather file passed to ``pd.read_feather``.
        usecols: Optional list of column names to load; ``None`` loads
            every column.

    Returns:
        A DataFrame with a freshly reset index containing the last row, in
        file order, of each ``customer_ID`` group.
    """
    # LOAD DATAFRAME
    if usecols is None:
        df = pd.read_feather(path)
    else:
        df = pd.read_feather(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = pd.to_datetime(df.S_2)

    # CREATE OVERALL ROW MISS VALUE
    #features = [x for x in df.columns.values if x not in ['customer_ID', 'target']]
    #df['n_missing'] = df[features].isna().sum(axis=1)

    # FILL NAN

    # UNCOMMENT THE FOLLOWING LINE OF CODE PLEASE

    #features_num = [x for x in df._get_numeric_data().columns.values if x not in ['customer_ID', 'target']]
    #df = df[features_num].fillna(NAN_VALUE)

    # KEEP ONLY FINAL CUSTOMER ID UNTIL FUTURE TIME SERIES WORK BEGINS
    last_rows = df.groupby(['customer_ID']).tail(1)
    df_out = last_rows.reset_index(drop=True)
    print('shape of data:', df_out.shape)
    return df_out

Also, sorry I cannot access your notebook maybe because it’s a private kernel.

Thanks,
Vinayak.

mcc_117 · June 5, 2022, 9:11am

When I remove the comment, I get this error:
KeyError: 'customer_ID'
Also, you should now be able to access my notebook.
KeyError: 'customer_ID'

ElisonSherton · June 5, 2022, 12:50pm

Hi @mcc_117

This is happening because when we’re doing fill_na, we’re subsetting into the columnspace of the df and removing the customer_ID and target fields.

# To ensure that the categorical features are imported only using CPU
def read_file_cpu(path = '', usecols = None):
    """Load a feather file with pandas and return the last row of every customer.

    Missing values in the numeric columns of the returned frame are replaced
    by the module-level ``NAN_VALUE``; non-numeric columns are left untouched.

    Args:
        path: Location of the feather file passed to ``pd.read_feather``.
        usecols: Optional list of column names to load; ``None`` loads
            every column.

    Returns:
        A DataFrame with a freshly reset index that holds, for each
        ``customer_ID``, the last of its rows in file order, plus an
        ``n_missing`` column with the number of values that row was missing
        (counted before the fill) across every column other than
        ``customer_ID`` and ``target``.
    """
    # LOAD DATAFRAME
    if usecols is not None: df = pd.read_feather(path, columns=usecols)
    else: df = pd.read_feather(path)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = pd.to_datetime(df.S_2)

    # KEEP ONLY FINAL CUSTOMER ID UNTIL FUTURE TIME SERIES WORK BEGINS
    # Done first: the steps below are purely row-wise, so running them on the
    # reduced frame yields the same rows while the temporary copies made by
    # isna()/fillna() cover one row per customer instead of the full history.
    df_out = df.groupby(['customer_ID']).tail(1).reset_index(drop=True)
    del df  # release the full frame before allocating the temporaries below

    # CREATE OVERALL ROW MISS VALUE
    features = [x for x in df_out.columns.values if x not in ['customer_ID', 'target']]
    df_out['n_missing'] = df_out[features].isna().sum(axis=1)

    # FILL NAN
    features_num = [x for x in df_out._get_numeric_data().columns.values if x not in ['customer_ID', 'target']]
    # Assign the filled columns back. `df[cols].fillna(..., inplace=True)`
    # fills a temporary copy of the selected columns, so the frame itself is
    # never updated and pandas raises SettingWithCopyError when
    # mode.chained_assignment is set to 'raise'.
    df_out[features_num] = df_out[features_num].fillna(NAN_VALUE)

    print('shape of data:', df_out.shape)
    return df_out

Also, I don’t know what the line

df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

is supposed to do as I don’t have info on what’s the content of customer_ID but this KeyError should certainly get resolved.

Thanks,
Vinayak.

mcc_117 · June 6, 2022, 3:15am

This is happening because when we’re doing fill_na , we’re subsetting into the columnspace of the df and removing the customer_ID and target fields.
What change did you make to the code to get it to work?

ElisonSherton · June 6, 2022, 5:57am

I have modified this. Earlier we were overwriting df by the return value of the above call. Now, I have just replaced the values inplace, that’s it.

Hope this helps.

Thanks,
Vinayak.

mcc_117 · June 6, 2022, 6:10am

It now returns this error.

---------------------------------------------------------------------------
SettingWithCopyError                      Traceback (most recent call last)
/tmp/ipykernel_33/2571868665.py in <module>
     43 print('Reading train data...')
     44 TRAIN_PATH = '../input/amexfeather/train_data.ftr'
---> 45 train_df = read_file_cpu(path = TRAIN_PATH)
     46 
     47 print('Reading test data...')

/tmp/ipykernel_33/2571868665.py in read_file_cpu(path, usecols)
     33 
     34     features_num = [x for x in df._get_numeric_data().columns.values if x not in ['customer_ID', 'target']]
---> 35     df[features_num].fillna(NAN_VALUE, inplace = True)
     36 
     37     # KEEP ONLY FINAL CUSTOMER ID UNTIL FUTURE TIME SERIES WORK BEGINS

/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in fillna(self, value, method, axis, inplace, limit, downcast)
   5180             inplace=inplace,
   5181             limit=limit,
-> 5182             downcast=downcast,
   5183         )
   5184 

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
   6390         result = self._constructor(new_data)
   6391         if inplace:
-> 6392             return self._update_inplace(result)
   6393         else:
   6394             return result.__finalize__(self, method="fillna")

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _update_inplace(self, result, verify_is_copy)
   4237         self._clear_item_cache()
   4238         self._mgr = result._mgr
-> 4239         self._maybe_update_cacher(verify_is_copy=verify_is_copy)
   4240 
   4241     @final

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _maybe_update_cacher(self, clear, verify_is_copy)
   3508 
   3509         if verify_is_copy:
-> 3510             self._check_setitem_copy(stacklevel=5, t="referent")
   3511 
   3512         if clear:

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _check_setitem_copy(self, stacklevel, t, force)
   3931 
   3932         if value == "raise":
-> 3933             raise com.SettingWithCopyError(t)
   3934         elif value == "warn":
   3935             warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel)

SettingWithCopyError: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

ElisonSherton · June 6, 2022, 8:19am

Umm. alright could you try this

df[features_num] = df[features_num].fillna(NAN_VALUE)

Thanks,
Vinayak.

mcc_117 · June 6, 2022, 8:35am

I ran it 3 to 4 times. It is no longer an error, but I now get this message:
Your notebook tried to allocate more memory than is available. It has restarted.

I have also started another notebook and tried something else, and it works up to this line:
df = add_datepart(train, 'S_2')
I have also posted that question here:

https://forums.fast.ai/t/attributeerror-datetimeproperties-object-has-no-attribute-week/96781
Maybe you can find a solution for that one as well. Thanks for the help, by the way.

ab_ai · July 24, 2022, 5:00am

Any chance this is actually happening with the Test dataset and not your training data? I’ve had this issue come up before when the training data does not have missing data but my test data does. The FillMissing proc actually fills in some data for missing values but adds a new categorical column and sets the value of the new column of that row to indicate the value was missing.

Possibly this might be introduced by your procs:

procs = [Categorify, FillMissing]

bilalUWE · December 12, 2022, 7:23pm

I’m facing the same issue as the training data has no nulls whereas the test data has null values in several places. The following error is shown below:

The procs I used for training the model are:

Is there a workaround for resolving this issue (at the inference stage) without incorporating nulls in the training data?

Many thanks and

Kind regards,
Bilal