Hi,
I used the trick you suggested to distribute the data and pass it to TextLMDataBunch, which seems to work fine, but when I try to use the classifier it fails (TextClasDataBunch; error pasted below).
-
My labels are email IDs; based on the text, I want to know which ticket is assigned to which email ID. I have removed all the special characters (@, ., %, -, *).
-
Also, I have checked the data frame: there are no nulls in any of the columns.
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(ml_df, test_size=0.2)
data_lm = TextLMDataBunch.from_df(".",train_df=train_df,valid_df=valid_df,text_cols='text', label_cols='label',bs=16)
valid_df.head()
label text is_valid
sosachromiumorg Provider stronger correctness guarantees for p… True
cmpchromiumorg redirect can cause server to flood itself False
mswchromiumorg REGRESSION: line hight of marked text in omnib… True
data_clas = TextClasDataBunch.from_df(".", train_df=train_df,valid_df=valid_df, vocab=data_lm.train_ds.vocab, text_cols='text', label_cols='label',bs=16)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-178-def9fef8cf8a> in <module>
1 #data_clas = TextClasDataBunch.from_df(".",train_df=train_df,valid_df=valid_df,vocab=data_lm.train_ds.vocab,bs=16)
2
----> 3 data_clas = TextClasDataBunch.from_df(".", train_df=train_df,valid_df=valid_df, vocab=data_lm.train_ds.vocab, text_cols='text', label_cols='label',bs=16)
D:\ML\Anaconda\lib\site-packages\fastai\text\data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, chunksize, max_vocab, min_freq, mark_fields, include_bos, include_eos, **kwargs)
202 else:
203 if label_delim is not None: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
--> 204 else: src = src.label_from_df(cols=label_cols, classes=classes)
205 if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
206 return src.databunch(**kwargs)
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in _inner(*args, **kwargs)
475 self.valid = fv(*args, from_item_lists=True, **kwargs)
476 self.__class__ = LabelLists
--> 477 self.process()
478 return self
479 return _inner
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self)
529 "Process the inner datasets."
530 xp,yp = self.get_processors()
--> 531 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
532 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
533 for ds in self.lists:
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, xp, yp, name)
694 def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
695 "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 696 self.y.process(yp)
697 if getattr(self.y, 'filter_missing_y', False):
698 filt = array([o is None for o in self.y.items])
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, processor)
81 if processor is not None: self.processor = processor
82 self.processor = listify(self.processor)
---> 83 for p in self.processor: p.process(self)
84 return self
85
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
346 ds.classes = self.classes
347 ds.c2i = self.c2i
--> 348 super().process(ds)
349
350 def __getstate__(self): return {n:getattr(self,n) for n in self.state_attrs}
D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
50 def __init__(self, ds:Collection=None): self.ref_ds = ds
51 def process_one(self, item:Any): return item
---> 52 def process(self, ds:Collection): ds.items = array([self.process_one(item) for item in ds.items])
53
54 PreProcessors = Union[PreProcessor, Collection[PreProcessor]]
D:\ML\Anaconda\lib\site-packages\fastai\core.py in array(a, dtype, **kwargs)
281 if np.int_==np.int32 and dtype is None and is_listy(a) and len(a) and isinstance(a[0],int):
282 dtype=np.int64
--> 283 return np.array(a, dtype=dtype, **kwargs)
284
285 class EmptyLabel(ItemBase):
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'