So I’ve been trying to implement the SMOTE method by running
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=23, ratio = ‘minority’, n_jobs=-1)
%time X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
#Convert back to dataframe to get dataloader to work
X_trn_res = pd.DataFrame(data=X_train_res,columns=df.columns)
y_trn_res = pd.DataFrame(data=y_train_res,columns=[‘TARGET’]
y_valid = pd.DataFrame(data=y_valid,columns=[‘TARGET’])
My code works all the way through training before attempting to implement the SMOTE code.
After inserting the code above, I have begun to get a keyerror in the lr_find() and nothing else will run.
It feels like it has to be a simple issue of converting back from the numpy array output by the SMOTE code back into dataframes so i can use my existing code, but for the life of me I can’t figure out what’s wrong. Does anyone have experience with the imblearn package or smote and getting it to place nice with fast.ai ?
Please forgive the huge error message dump but after a second day debugging with no progress I’m flailing for help a little bit here. There must be a way the dataloader is handling the indexes in a way that gets destroyed when being upsampled in the SMOTE algo and output as a np array and then reconstituted as a dataframe. At this point, I’m just going to keep things as arrays and convert the remaining dataframes to arrays and use the ColumnarModelData.from_arrays() method.
md2 = ColumnarModelData.from_data_frames('', trn_df = X_trn_res, val_df = X_valid, trn_y = y_trn_res.astype('int'),
val_y = y_valid.astype('int'), cat_flds=cat_vars, bs=256, is_reg= False,is_multi=False)
m2 = MixedInputModel(emb_szs, n_cont = len(df.columns)-len(cat_vars), emb_drop = 0.1, out_sz = 2,
szs = [1000, 800, 600, 400, 200], drops = [0.5, 0.4, 0.3, 0.2, 0.1],y_range = None,
use_bn = False, is_reg = False, is_multi = False)
bm2 = BasicModel(m2.cuda(), 'binary_classifier')
learn2 = StructuredLearner(md2, bm2)
learn2.lr_find()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\Lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 111286
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-63-19b58a336fcf> in <module>()
----> 1 learn2.lr_find()
~\fastai\courses\dl1\structured\fastai\learner.py in lr_find(self, start_lr, end_lr, wds, linear, **kwargs)
328 layer_opt = self.get_layer_opt(start_lr, wds)
329 self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)
--> 330 self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)
331 self.load('tmp')
332
~\fastai\courses\dl1\structured\fastai\learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
232 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
233 swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234 swa_eval_freq=swa_eval_freq, **kwargs)
235
236 def get_layer_groups(self): return self.models.get_layer_groups()
~\fastai\courses\dl1\structured\fastai\model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
135 if all_val: val_iter = IterBatch(cur_data.val_dl)
136
--> 137 for (*x,y) in t:
138 batch_num += 1
139 for cb in callbacks: cb.on_batch_begin()
~\Anaconda3\Lib\site-packages\tqdm\_tqdm.py in __iter__(self)
928 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
929
--> 930 for obj in iterable:
931 yield obj
932 # Update and possibly print the progressbar.
~\fastai\courses\dl1\structured\fastai\dataloader.py in __iter__(self)
86 # avoid py3.6 issue where queue is infinite and can result in memory exhaustion
87 for c in chunk_iter(iter(self.batch_sampler), self.num_workers*10):
---> 88 for batch in e.map(self.get_batch, c):
89 yield get_tensor(batch, self.pin_memory, self.half)
90
~\Anaconda3\envs\fastai\lib\concurrent\futures\_base.py in result_iterator()
584 # Careful not to keep a reference to the popped future
585 if timeout is None:
--> 586 yield fs.pop().result()
587 else:
588 yield fs.pop().result(end_time - time.time())
~\Anaconda3\envs\fastai\lib\concurrent\futures\_base.py in result(self, timeout)
423 raise CancelledError()
424 elif self._state == FINISHED:
--> 425 return self.__get_result()
426
427 self._condition.wait(timeout)
~\Anaconda3\envs\fastai\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
~\Anaconda3\envs\fastai\lib\concurrent\futures\thread.py in run(self)
54
55 try:
---> 56 result = self.fn(*self.args, **self.kwargs)
57 except BaseException as exc:
58 self.future.set_exception(exc)
~\fastai\courses\dl1\structured\fastai\dataloader.py in get_batch(self, indices)
73
74 def get_batch(self, indices):
---> 75 res = self.np_collate([self.dataset[i] for i in indices])
76 if self.transpose: res[0] = res[0].T
77 if self.transpose_y: res[1] = res[1].T
~\fastai\courses\dl1\structured\fastai\dataloader.py in <listcomp>(.0)
73
74 def get_batch(self, indices):
---> 75 res = self.np_collate([self.dataset[i] for i in indices])
76 if self.transpose: res[0] = res[0].T
77 if self.transpose_y: res[1] = res[1].T
~\fastai\courses\dl1\structured\fastai\column_data.py in __getitem__(self, idx)
35
36 def __getitem__(self, idx):
---> 37 return [self.cats[idx], self.conts[idx], self.y[idx]]
38
39 @classmethod
~\Anaconda3\Lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\Anaconda3\Lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\Anaconda3\Lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
~\Anaconda3\Lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
~\Anaconda3\Lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 111286