Textlist from df

Hello everyone! I’m trying to build a text list from a data frame, with one column being the text, and the other column being a rating of 1-5.

This was my attempt:
data_clas = (TextList.from_df(df, path, cols=1,vocab=data_lm.vocab)
.random_split_by_pct(0.1)
.databunch(bs=bs))

however I get the following error:
Your data isn’t labeled, can’t turn it in a DataBunch yet!

Which I agree, it isn’t labeled, but I don’t see a parameter to set the labels to a column. I tried “label_cols=2” but it threw an error saying label_cols is unknown.

If it’s in the documentation, I’ve looked for quite a while and I sure don’t see it, so my apologies if I missed it.

Any thoughts? Thanks in advance!

What is the name of the column? You should be able to pass that into cols. If you do
TextClasDataBunch.from_df() you can pass in which one is the text and which one is the label, e.g. the text column is ‘content’ and the label is ‘type’:

data_clas = TextClasDataBunch.from_df(path, df, vocab=data_lm.train_ds.vocab, text_cols = 'content', label_cols = 'type',bs=16)

if you want to use the number for each column, do it for cols and labels

2 Likes

Thanks for the response!

So I’m putting in the following:
data_clas = TextClasDataBunch.from_df(path, train_df=df[:-1000], valid_df=df[-1000:], vocab=data_lm.vocab, text_cols = ‘text’, label_cols = ‘review’,bs=48).no_split().databunch(bs=bs)

and getting:
Your data isn’t labeled, can’t turn it in a DataBunch yet!

But shouldn’t label_cols have labeled it for me?

You shouldn’t need .no_split() or .databunch() as the original already makes it :slight_smile: What does doing that output?

works perfectly, thanks!

No problem! Glad I could help :slight_smile:

hi ,
I used the trick you suggested to distribute the data and pass it to the TextLMDataBunch which seems to work fine , but when I try to use the classifier it fails (TextClasDataBunch , error pasted below)

  1. My labels are email id , based on text I want to know which ticket is assigned to which email id. I have removed the @,.,%,-,* everything.

  2. Also I have checked the data frame there is no null in any of the columns.

    from sklearn.model_selection import train_test_split
    train_df, valid_df = train_test_split(ml_df, test_size=0.2)
    data_lm = TextLMDataBunch.from_df(".",train_df=train_df,valid_df=valid_df,text_cols=‘text’, label_cols=‘label’,bs=16)

    valid_df.head()

     label	text	is_valid
    

    sosachromiumorg Provider stronger correctness guarantees for p… True
    cmpchromiumorg redirect can cause server to flood itself False
    mswchromiumorg REGRESSION: line hight of marked text in omnib… True

data_clas = TextClasDataBunch.from_df(".", train_df=train_df,valid_df=valid_df, vocab=data_lm.train_ds.vocab, text_cols='text', label_cols='label',bs=16)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-178-def9fef8cf8a> in <module>
  1 #data_clas = TextClasDataBunch.from_df(".",train_df=train_df,valid_df=valid_df,vocab=data_lm.train_ds.vocab,bs=16)
  2 
----> 3 data_clas = TextClasDataBunch.from_df(".", train_df=train_df,valid_df=valid_df, vocab=data_lm.train_ds.vocab, text_cols='text', label_cols='label',bs=16)

D:\ML\Anaconda\lib\site-packages\fastai\text\data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, chunksize, max_vocab, min_freq, mark_fields, include_bos, include_eos, **kwargs)
202         else:
203             if label_delim is not None: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
--> 204             else: src = src.label_from_df(cols=label_cols, classes=classes)
205         if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
206         return src.databunch(**kwargs)

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in _inner(*args, **kwargs)
475             self.valid = fv(*args, from_item_lists=True, **kwargs)
476             self.__class__ = LabelLists
--> 477             self.process()
478             return self
479         return _inner

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self)
529         "Process the inner datasets."
530         xp,yp = self.get_processors()
--> 531         for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
532         #progress_bar clear the outputs so in some case warnings issued during processing disappear.
533         for ds in self.lists:

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, xp, yp, name)
694     def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
695         "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 696         self.y.process(yp)
697         if getattr(self.y, 'filter_missing_y', False):
698             filt = array([o is None for o in self.y.items])

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, processor)
 81         if processor is not None: self.processor = processor
 82         self.processor = listify(self.processor)
---> 83         for p in self.processor: p.process(self)
 84         return self
 85 

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
346         ds.classes = self.classes
347         ds.c2i = self.c2i
--> 348         super().process(ds)
349 
350     def __getstate__(self): return {n:getattr(self,n) for n in self.state_attrs}

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
 50     def __init__(self, ds:Collection=None):  self.ref_ds = ds
 51     def process_one(self, item:Any):         return item
---> 52     def process(self, ds:Collection):        ds.items = array([self.process_one(item) for item in ds.items])
 53 
 54 PreProcessors = Union[PreProcessor, Collection[PreProcessor]]

D:\ML\Anaconda\lib\site-packages\fastai\core.py in array(a, dtype, **kwargs)
281     if np.int_==np.int32 and dtype is None and is_listy(a) and len(a) and isinstance(a[0],int):
282         dtype=np.int64
--> 283     return np.array(a, dtype=dtype, **kwargs)
284 
285 class EmptyLabel(ItemBase):

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

Your initial problem comes from your use of the datablock API. In order to create a Databunch with the datablock api, you must:

  • Create your data (TextList.from_df)
  • Create your validation set (.random_split_by_pct(0.1))
  • Add labels
  • Transform into databunch (.databunch(bs=bs))

In your case, you’re missing the Add labels step. So instead of

TextList.from_df(df, path, cols=1,vocab=data_lm.vocab)
.random_split_by_pct(0.1)
.databunch(bs=bs)

you should do something along the lines of

TextList.from_df(df, path, cols=1,vocab=data_lm.vocab)
.random_split_by_pct(0.1)
.label_from_df(cols='type')
.databunch(bs=bs)

The information can be found here : https://docs.fast.ai/data_block.html#ItemList.label_from_df

While the default TextDataBunch creator works well here, being able to use the Data Block API is super useful when the default doesn’t work.

1 Like

Thanks @StatisticDean.

The api behaviour is strange , with random_split_by_pct(0.1) it gives the same error. I see a new one created to split the data split_by_rand_pct.

It works like below:
data_clas=(TextList.from_df(train_df, ".", cols='text',vocab=data_lm.train_ds.vocab)
.random_split_by_pct(0)
.label_from_df(cols='label')
.databunch(bs=16))

Could you be more specific with the error you’re getting? I suggest you separate each part of the databunch creation to see which step is creating the problem. Something like :

data_list = TextList.from_df(...)
split_data = data_list.random_split_by_pct(...)
labeled_data = split_data.label_from_df(...)
data = labeled_data.databunch(...)

And then try to investigate the error message, and if you don’t succeed, paste it here.

1 Like

hi
I was able to create the language model , but it is failing to create a databunch in the classifier [same as before].

my Data is in .csv with three columns : Label,Text , is_valid
Label - Email id
Text : issues description
is_valid = True or False

@StatisticDean : I tried the same as you have mentioned ,

Language -

learn.fit_one_cycle(3,1e-2, moms=(0.8,0.7))

epoch train_loss valid_loss accuracy time
0 4.176631 3.534079 0.433560 1:20:58
1 3.456771 3.049165 0.505931 1:20:30
2 2.806027 2.936237 0.523615 1:19:11

learn.fit_one_cycle(5,1e-2, moms=(0.8,0.7))

epoch train_loss valid_loss accuracy time
0 2.819904 3.091885 0.500018 1:20:58
1 3.097184 3.183694 0.495549 1:23:07
2 2.815591 3.065641 0.519857 1:24:41
3 2.404943 3.022329 0.535245 1:23:50
4 2.085701 3.043083 0.536762 1:22:18
learn.save('fit_head_2')
learn.save_encoder('fine_tuned_enc')
learn.predict('Become a fan', 100 , temperature=1.1, min_p=0.001)

‘Become a fan dependent build in VM Go a thread dependent build in VM xxbos shill Cellular : Cellular API should display a device property policy shill : Cellular API should display a device policy property xxbos Update ibus - mozc - chewing to 1.6.6 - pinyin , leading to font - ordering Update ibus - mozc - chewing to 1.6.6 * , revision to 0.13.499.102 URL for ibus - memconf xxbos AU : provide support for AU Test with entd test AU : provide support for’

Classify -

data_list = TextList.from_csv(".",'csv_df.csv',vocab=data_lm.vocab,cols='text')
data_split = data_list.split_from_df(col='is_valid')
data_label = data_split.label_from_df(cols='label')

TypeError Traceback (most recent call last)
in
----> 1 data_label = data_split.label_from_df(cols=‘label’)

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in _inner(*args, **kwargs)
475 self.valid = fv(*args, from_item_lists=True, **kwargs)
476 self.__class__ = LabelLists
–> 477 self.process()
478 return self
479 return _inner

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self)
529 “Process the inner datasets.”
530 xp,yp = self.get_processors()
–> 531 for ds,n in zip(self.lists, [‘train’,‘valid’,‘test’]): ds.process(xp, yp, name=n)
532 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
533 for ds in self.lists:

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, xp, yp, name)
694 def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
695 “Launch the processing on self.x and self.y with xp and yp.”
–> 696 self.y.process(yp)
697 if getattr(self.y, ‘filter_missing_y’, False):
698 filt = array([o is None for o in self.y.items])

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, processor)
81 if processor is not None: self.processor = processor
82 self.processor = listify(self.processor)
—> 83 for p in self.processor: p.process(self)
84 return self
85

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
346 ds.classes = self.classes
347 ds.c2i = self.c2i
–> 348 super().process(ds)
349
350 def __getstate__(self): return {n:getattr(self,n) for n in self.state_attrs}

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
50 def __init__(self, ds:Collection=None): self.ref_ds = ds
51 def process_one(self, item:Any): return item
—> 52 def process(self, ds:Collection): ds.items = array([self.process_one(item) for item in ds.items])
53
54 PreProcessors = Union[PreProcessor, Collection[PreProcessor]]

D:\ML\Anaconda\lib\site-packages\fastai\core.py in array(a, dtype, **kwargs)
281 if np.int_==np.int32 and dtype is None and is_listy(a) and len(a) and isinstance(a[0],int):
282 dtype=np.int64
–> 283 return np.array(a, dtype=dtype, **kwargs)
284
285 class EmptyLabel(ItemBase):

TypeError: int() argument must be a string, a bytes-like object or a number, not ‘NoneType’

Another [second] way I tried but I still don’t understand why it is failing : Saving the DF to csv

  1. I converted the DF to a csv file

csv_df.head()

label text is_valid
0 venkataramana@chromium.org Errors in importing from firefox Errors in imp… False
1 mal.chro...@gmail.com Scrolling with middle-mouse button does not wo… False
2 aa@chromium.org Wishlist: Chrome does not have an addon-system… False
3 cbentzel@chromium.org Automatic integrated windows authentication (a… False
4 eroman@chromium.org Facebook: Commenting on Status not working Fac… True
  2. Created a datalist from it -

data_list = TextList.from_df(csv_df, “.”, vocab=data_lm.vocab,cols=[‘label’,‘text’,‘is_valid’])
data_list.items

   array([['venkataramana@chromium.org', 'Errors in importing from firefox Errors in importing from firefox', 'False'],
       ['mal.chro...@gmail.com',
        'Scrolling with middle-mouse button does not work (autoscroll) Scrolling with middle-mouse button does not work (autoscroll)',
        'False'],
       ['aa@chromium.org',
        'Wishlist: Chrome does not have an addon-system Wishlist: Chrome does not have an addon-system', 'False'],
       ['cbentzel@chromium.org',
        'Automatic integrated windows authentication (aka automatic NTLM / Negotiate Auth support) Automatic integrated windows authentication (aka automatic NTLM / Negotiate Auth support)',
        'False'],
       ...,
       ['vsevik@chromium.org',
        'Not possible to specify sourcemaps generated inline Not possible to specify sourcemaps generated inline',
        'False'],
       ['venkatraman@chromium.org', '336644965.xlsx corrupt on roundtrip 336644965.xlsx corrupt on roundtrip', 'True'],
       ['junov@chromium.org',
        'CanvasRenderingContext2D.isPointInPath() incorrectly returns false for points on a line CanvasRenderingContext2D.isPointInPath() incorrectly returns false for points on a line',
        'False'],
       ['denniskempin@chromium.org', 'Replace tpcontrol/mousecontrol scripts Replace tpcontrol/mousecontrol scripts',
        'True']], dtype=object)

3.Split the data : This gives an empty array. Why so ?

data_split = data_list.split_from_df(col=‘is_valid’)
data_split.items
array([], shape=(0, 3), dtype=object)

This is the third type I tried, and it gives the same error : Seems it is not able to split the data by label .
@muellerzr @StatisticDean

data_clas = TextClasDataBunch.from_df(".", train_df=csv_df[:-1000], valid_df=csv_df[-1000:], vocab=data_lm.vocab, text_cols = ‘text’, label_cols = ‘label’,bs=16)

--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-332-15f329a3e58f> in <module>
----> 1 data_clas = TextClasDataBunch.from_df(".", train_df=csv_df[:-1000], valid_df=csv_df[-1000:], vocab=data_lm.vocab, text_cols = 'text', label_cols = 'label',bs=16)

D:\ML\Anaconda\lib\site-packages\fastai\text\data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, chunksize, max_vocab, min_freq, mark_fields, include_bos, include_eos, **kwargs)
    202         else:
    203             if label_delim is not None: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
--> 204             else: src = src.label_from_df(cols=label_cols, classes=classes)
    205         if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
    206         return src.databunch(**kwargs)

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in _inner(*args, **kwargs)
    475             self.valid = fv(*args, from_item_lists=True, **kwargs)
    476             self.__class__ = LabelLists
--> 477             self.process()
    478             return self
    479         return _inner

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self)
    529         "Process the inner datasets."
    530         xp,yp = self.get_processors()
--> 531         for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
    532         #progress_bar clear the outputs so in some case warnings issued during processing disappear.
    533         for ds in self.lists:

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, xp, yp, name)
    694     def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
    695         "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 696         self.y.process(yp)
    697         if getattr(self.y, 'filter_missing_y', False):
    698             filt = array([o is None for o in self.y.items])

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, processor)
     81         if processor is not None: self.processor = processor
     82         self.processor = listify(self.processor)
---> 83         for p in self.processor: p.process(self)
     84         return self
     85 

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
    346         ds.classes = self.classes
    347         ds.c2i = self.c2i
--> 348         super().process(ds)
    349 
    350     def __getstate__(self): return {n:getattr(self,n) for n in self.state_attrs}

D:\ML\Anaconda\lib\site-packages\fastai\data_block.py in process(self, ds)
     50     def __init__(self, ds:Collection=None):  self.ref_ds = ds
     51     def process_one(self, item:Any):         return item
---> 52     def process(self, ds:Collection):        ds.items = array([self.process_one(item) for item in ds.items])
     53 
     54 PreProcessors = Union[PreProcessor, Collection[PreProcessor]]

D:\ML\Anaconda\lib\site-packages\fastai\core.py in array(a, dtype, **kwargs)
    281     if np.int_==np.int32 and dtype is None and is_listy(a) and len(a) and isinstance(a[0],int):
    282         dtype=np.int64
--> 283     return np.array(a, dtype=dtype, **kwargs)
    284 
    285 class EmptyLabel(ItemBase):

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

@jbo This is weird. I started looking into it but I haven’t found an explanation yet. When you do

data_split =  data_list.split_from_df(col=‘is_valid’)
data_split.items
array([], shape=(0, 3), dtype=object), 

The error seems to be coming from before. When you look at the data_list, you get the following error :

data_list = TextList.from_df(df, '.',cols='text')
data_list

AttributeError                            Traceback (most recent call last)
~/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    400                         if cls is not object \
    401                                 and callable(cls.__dict__.get('__repr__')):
--> 402                             return _repr_pprint(obj, self, cycle)
    403 
    404             return _default_pprint(obj, self, cycle)

~/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    695     """A pprint that just redirects to the normal repr function."""
    696     # Find newlines and replace them with p.break_()
--> 697     output = repr(obj)
    698     for idx,output_line in enumerate(output.splitlines()):
    699         if idx:

~/Documents/fastai/fastai/data_block.py in __repr__(self)
     74         return self.items[i]
     75     def __repr__(self)->str:
---> 76         items = [self[i] for i in range(min(5,len(self.items)))]
     77         return f'{self.__class__.__name__} ({len(self.items)} items)\n{show_some(items)}\nPath: {self.path}'
     78 

~/Documents/fastai/fastai/data_block.py in <listcomp>(.0)
     74         return self.items[i]
     75     def __repr__(self)->str:
---> 76         items = [self[i] for i in range(min(5,len(self.items)))]
     77         return f'{self.__class__.__name__} ({len(self.items)} items)\n{show_some(items)}\nPath: {self.path}'
     78 

~/Documents/fastai/fastai/data_block.py in __getitem__(self, idxs)
    116         "returns a single item based if `idxs` is an integer or a new `ItemList` object if `idxs` is a range."
    117         idxs = try_int(idxs)
--> 118         if isinstance(idxs, Integral): return self.get(idxs)
    119         else: return self.new(self.items[idxs], inner_df=index_row(self.inner_df, idxs))
    120 

~/Documents/fastai/fastai/text/data.py in get(self, i)
    329     def get(self, i):
    330         o = super().get(i)
--> 331         return Text(o, self.vocab.textify(o, self.sep))
    332 
    333     def label_for_lm(self, **kwargs):

AttributeError: 'NoneType' object has no attribute 'textify'

So I would suggest investigating here. I can’t give you more information at the moment. I’ll come back to you if I find some.

Edit : This error comes from not specifying the vocab here since it defaults to None. Still it is weird that this basic TextList init doesn’t display well.

The cols argument should only be the column containing the text. (Still doesn’t explain the error).

hi ,
I am trying to edit , my bad for the previous post , it was in a hurry.

If the label column contains unique email ids, will split_from_df fail? Currently I am trying to remove all the email ids which are repeated fewer than 20 times.

The data set is present in http://bugtriage.mybluemix.net/#chrome

with the dataset I have done the following to achieve a 52% accuracy of the language model

import json
import pandas as pd
from pprint import pprint
from fastai import *
from fastai.tabular import *
from fastai.text import *
%reload_ext autoreload
%autoreload 2
%matplotlib inline

#Load Json to DF
with open('deep_data.json') as f:
    data = json.load(f,strict=False)

deep_df = pd.DataFrame(data)
#deep_df.head
deep_df.columns

Index(['description', 'id', 'issue_id', 'issue_title', 'owner',
       'reported_time'],
      dtype='object')

deep_df["description"].replace(regex=['\r'], value=' ',inplace=True)
deep_df["description"].replace(regex=['\n'], value=' ',inplace=True)
deep_df["description"].replace(regex=[','], value=' ',inplace=True)

    def remove_non_ascii(text):
        """Return `text` with every character outside printable ASCII removed.

        Characters with code points 32..127 are kept. The space character
        (code 32) is kept on purpose so that words stay separated: newlines
        and commas in the column are replaced with spaces before this
        function is applied, and a strict `> 32` test would delete those
        spaces and fuse neighbouring words together.

        Control characters (code points below 32) and non-ASCII characters
        (code points 128 and above) are dropped.
        """
        return ''.join(ch for ch in text if 32 <= ord(ch) < 128)
     
    deep_df['description'] = deep_df['description'].apply(remove_non_ascii)

    deep_df[['owner']].count()

    owner    163695
    dtype: int64

deep_df = deep_df[deep_df['owner']!='']

import numpy as np
from numpy.random import choice
is_valid = ["True","False"]
probabilities = [0.2, 0.8]
deep_df["is_valid"]=np.random.choice(is_valid, p=probabilities, size=len(deep_df))

deep_df.shape
(45459, 7)

deep_df= add_datepart(deep_df,"reported_time",drop=False)

small_df = pd.DataFrame(columns=['label', 'text', 'is_valid'])

deep_df['Merge'] = deep_df['issue_title'] + " " + deep_df['issue_title']

deep_df.reset_index(drop=True,inplace=True)

csv_df = pd.DataFrame(columns=['label', 'text', 'is_valid'])

csv_df[['label','text','is_valid']]= deep_df[['owner', 'Merge', 'is_valid']]

csv_df.isnull().any()

label       False
text        False
is_valid    False
dtype: bool

csv_df.to_csv("csv_df.csv",na_rep="",index=False)

bs=16
data_lm = TextLMDataBunch.from_csv(".",'csv_df.csv')
learn = language_model_learner(data_lm, AWD_LSTM)

learn.fit_one_cycle(1,1e-2, moms=(0.8,0.7), wd=0.1)
epoch train_loss valid_loss accuracy time
0 5.477064 4.653409 0.250460 48:15
learn.unfreeze()
learn.fit_one_cycle(3,1e-2, moms=(0.8,0.7))
epoch train_loss valid_loss accuracy time
0 4.176631 3.534079 0.433560 1:20:58
1 3.456771 3.049165 0.505931 1:20:30
2 2.806027 2.936237 0.523615 1:19:11

learn.fit_one_cycle(5,1e-2, moms=(0.8,0.7))

epoch train_loss valid_loss accuracy time
0 2.819904 3.091885 0.500018 1:20:58
1 3.097184 3.183694 0.495549 1:23:07
2 2.815591 3.065641 0.519857 1:24:41
3 2.404943 3.022329 0.535245 1:23:50
4 2.085701 3.043083 0.536762 1:22:18

learn.save_encoder('fine_tuned_enc')

After this the problem starts with the classifier

Could you edit your post to format the code? Just select all the code and press Shift + Ctrl + C.

My bad, just edited the previous post.

1 Like

Maybe you could try using the split_by_idx function. Create a valid_idx that contains the index of the elements that should be in the valid set and pass it to this function. I can’t seem to get split_from_df to work.

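In the example you gave me, the ‘is_valid’ column contains strings rather than booleans. As a result, every row is put in the valid set: any non-empty string, including 'False', evaluates to True in a boolean context. This is the reason split_from_df doesn’t give you the result you’re expecting. Converting the column to real booleans first (for example `df['is_valid'] = df['is_valid'] == 'True'`) should make the split behave as intended.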

1 Like