Proc_df memory error


#1

I am getting a memory error when executing proc_df on the New York City Taxi Fare Prediction training data, which has about 55M rows and 7 columns. Is there any way to make it work on the machine I am using at Paperspace (spec below), or do I need to upgrade to a machine with more memory?

Paperspace machine setup:
MACHINE TYPE: P4000 HOURLY
REGION: CA1
RAM: 30 GB
CPUS: 8
HD: 34.7 GB / 250 GB
GPU: 8 GB

code:

df, y, nas = proc_df(df_raw, 'fare_amount')

error details:

MemoryError                               Traceback (most recent call last)
~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexing.py in _getbool_axis(self, key, axis)
   1495         try:
-> 1496             return self.obj._take(inds, axis=axis)
   1497         except Exception as detail:

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in _take(self, indices, axis, is_copy)
   2784     def _take(self, indices, axis=0, is_copy=True):
-> 2785         self._consolidate_inplace()
   2786 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in _consolidate_inplace(self)
   4438 
-> 4439         self._protect_consolidate(f)
   4440 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in _protect_consolidate(self, f)
   4427         blocks_before = len(self._data.blocks)
-> 4428         result = f()
   4429         if len(self._data.blocks) != blocks_before:

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/generic.py in f()
   4436         def f():
-> 4437             self._data = self._data.consolidate()
   4438 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in consolidate(self)
   4097         bm._is_consolidated = False
-> 4098         bm._consolidate_inplace()
   4099         return bm

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in _consolidate_inplace(self)
   4102         if not self.is_consolidated():
-> 4103             self.blocks = tuple(_consolidate(self.blocks))
   4104             self._is_consolidated = True

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in _consolidate(blocks)
   5068         merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
-> 5069                                       _can_consolidate=_can_consolidate)
   5070         new_blocks = _extend_blocks(merged_blocks, new_blocks)

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/internals.py in _merge_blocks(blocks, dtype, _can_consolidate)
   5091         argsort = np.argsort(new_mgr_locs)
-> 5092         new_values = new_values[argsort]
   5093         new_mgr_locs = new_mgr_locs[argsort]

MemoryError: 

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-23-35a73d8cb330> in <module>()
----> 1 df, y, nas = proc_df(df_raw, 'fare_amount')

~/fastai/courses/ml1/fastai/structured.py in proc_df(df, y_fld, skip_flds, ignore_flds, do_scale, na_dict, preproc_fn, max_n_cat, subset, mapper)
    445     if do_scale: mapper = scale_vars(df, mapper)
    446     for n,c in df.items(): numericalize(df, c, n, max_n_cat)
--> 447     df = pd.get_dummies(df, dummy_na=True)
    448     df = pd.concat([ignored_flds, df], axis=1)
    449     res = [df, y, na_dict]

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype)
    840         if columns is None:
    841             data_to_encode = data.select_dtypes(
--> 842                 include=dtypes_to_encode)
    843         else:
    844             data_to_encode = data[columns]

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in select_dtypes(self, include, exclude)
   3089 
   3090         dtype_indexer = include_these & exclude_these
-> 3091         return self.loc[com._get_info_slice(self, dtype_indexer)]
   3092 
   3093     def _box_item_values(self, key, values):

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1470             except (KeyError, IndexError):
   1471                 pass
-> 1472             return self._getitem_tuple(key)
   1473         else:
   1474             # we by definition only have the 0th axis

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
    888                 continue
    889 
--> 890             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
    891 
    892         return retval

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1866             return self._get_slice_axis(key, axis=axis)
   1867         elif com.is_bool_indexer(key):
-> 1868             return self._getbool_axis(key, axis=axis)
   1869         elif is_list_like_indexer(key):
   1870 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/indexing.py in _getbool_axis(self, key, axis)
   1496             return self.obj._take(inds, axis=axis)
   1497         except Exception as detail:
-> 1498             raise self._exception(detail)
   1499 
   1500     def _get_slice_axis(self, slice_obj, axis=None):

KeyError: MemoryError()

(Miguel) #2

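This may help: read the CSV in chunks and process one chunk at a time, instead of loading the whole file into memory at once: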

chunksize = 10 ** 6
for chunk in pd.read_csv(filename, chunksize=chunksize):
    process(chunk)

Ref: https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas


#3

Thanks Miguel

So far I have been able to execute proc_df successfully by changing the dtype of some columns; this kernel helped a lot: https://www.kaggle.com/szelee/how-to-import-a-csv-file-of-55-million-rows/notebook

Although I can read all 55M rows of data into a single dataframe and execute proc_df, I am now getting a memory error when executing RandomForestRegressor, so I think I will need to use chunking as you suggested.