Label function

Hi,

I would like to build a weed classifier from a set of images whose labels are listed in a CSV file.
Hence I tried to write my own function for the get_y parameter of DataBlock().
The lesson 2 example uses parent_label. My function is:
# Label lookup intended for DataBlock(get_y=...): selects the rows of df whose
# "Filename" contains o and returns the Species value of the first such row.
# NOTE(review): str.contains needs a string (or compiled pattern) for o; the
# traceback below shows it being handed something else.
def get_weed_label( o): return df[df[“Filename”].str.contains( o)].Species.iloc[0]

df is a data frame with the csv data. I tested the function and it returns the correct label for each image as a string; which is identical to what parent_label does.
However, after creating weeds = DataBlock(...), I get the TypeError shown below when calling:
dls = weeds.dataloaders( path).

Who can explain what the issue with my get_weed_label function is??

thanks a lot
Norbert

Error message:

TypeError Traceback (most recent call last)
/tmp/ipykernel_11948/27525593.py in
1 # path = Path( ‘weed_labels_in_directories’)
----> 2 dls = weeds.dataloaders( path)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
111
112 def dataloaders(self, source, path=’.’, verbose=False, **kwargs):
→ 113 dsets = self.datasets(source, verbose=verbose)
114 kwargs = {**self.dls_kwargs, **kwargs, ‘verbose’: verbose}
115 return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in datasets(self, source, verbose)
108 splits = (self.splitter or RandomSplitter())(items)
109 pv(f"{len(splits)} datasets of sizes {’,’.join([str(len(s)) for s in splits])}", verbose)
→ 110 return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
111
112 def dataloaders(self, source, path=’.’, verbose=False, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in init(self, items, tfms, tls, n_inp, dl_type, **kwargs)
332 def init(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
333 super().init(dl_type=dl_type)
→ 334 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
335 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
336

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in (.0)
332 def init(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
333 super().init(dl_type=dl_type)
→ 334 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
335 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
336

/opt/conda/lib/python3.7/site-packages/fastcore/foundation.py in call(cls, x, *args, **kwargs)
95 def call(cls, x=None, *args, **kwargs):
96 if not args and not kwargs and x is not None and isinstance(x,cls): return x
—> 97 return super().call(x, *args, **kwargs)
98
99 # Cell

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in init(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
255 if do_setup:
256 pv(f"Setting up {self.tfms}", verbose)
→ 257 self.setup(train_setup=train_setup)
258
259 def _new(self, items, split_idx=None, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in setup(self, train_setup)
274
275 def setup(self, train_setup=True):
→ 276 self.tfms.setup(self, train_setup)
277 if len(self) != 0:
278 x = super().getitem(0) if self.splits is None else super().getitem(self.splits[0])[0]

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in setup(self, items, train_setup)
190 tfms = self.fs[:]
191 self.fs.clear()
→ 192 for t in tfms: self.add(t,items, train_setup)
193
194 def add(self,ts, items=None, train_setup=False):

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in add(self, ts, items, train_setup)
194 def add(self,ts, items=None, train_setup=False):
195 if not is_listy(ts): ts=[ts]
→ 196 for t in ts: t.setup(items, train_setup)
197 self.fs+=ts
198 self.fs = self.fs.sorted(key=‘order’)

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in setup(self, items, train_setup)
77 def setup(self, items=None, train_setup=False):
78 train_setup = train_setup if self.train_setup is None else self.train_setup
—> 79 return self.setups(getattr(items, ‘train’, items) if train_setup else items)
80
81 def _call(self, fn, x, split_idx=None, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastcore/dispatch.py in call(self, *args, **kwargs)
121 elif self.inst is not None: f = MethodType(f, self.inst)
122 elif self.owner is not None: f = MethodType(f, self.owner)
→ 123 return f(*args, **kwargs)
124
125 def get(self, inst, owner):

/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in setups(self, dsets)
249
250 def setups(self, dsets):
→ 251 if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)
252 self.c = len(self.vocab)
253

/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in init(self, col, sort, add_na, strict)
225 if not hasattr(col,‘unique’): col = L(col, use_list=True)
226 # o==o is the generalized definition of non-NaN used by Pandas
→ 227 items = L(o for o in col.unique() if o==o)
228 if sort: items = items.sorted()
229 self.items = ‘#na#’ + items if add_na else items

/opt/conda/lib/python3.7/site-packages/fastcore/foundation.py in unique(self, sort, bidir, start)
161 def enumerate(self): return L(enumerate(self))
162 def renumerate(self): return L(renumerate(self))
→ 163 def unique(self, sort=False, bidir=False, start=None): return L(uniqueify(self, sort=sort, bidir=bidir, start=start))
164 def val2idx(self): return val2idx(self)
165 def cycle(self): return cycle(self)

/opt/conda/lib/python3.7/site-packages/fastcore/basics.py in uniqueify(x, sort, bidir, start)
661 def uniqueify(x, sort=False, bidir=False, start=None):
662 “Unique elements in x, optional sort, optional return reverse correspondence, optional prepend with elements.”
→ 663 res = list(dict.fromkeys(x))
664 if start is not None: res = listify(start)+res
665 if sort: res.sort()

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in (.0)
266 def _after_item(self, o): return self.tfms(o)
267 def repr(self): return f"{self.class.name}: {self.items}\ntfms - {self.tfms.fs}"
→ 268 def iter(self): return (self[i] for i in range(len(self)))
269 def show(self, o, **kwargs): return self.tfms.show(o, **kwargs)
270 def decode(self, o, **kwargs): return self.tfms.decode(o, **kwargs)

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in getitem(self, idx)
302 res = super().getitem(idx)
303 if self._after_item is None: return res
→ 304 return self._after_item(res) if is_indexer(idx) else res.map(self._after_item)
305
306 # Cell

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in _after_item(self, o)
264 raise
265 def subset(self, i): return self._new(self._get(self.splits[i]), split_idx=i)
→ 266 def _after_item(self, o): return self.tfms(o)
267 def repr(self): return f"{self.class.name}: {self.items}\ntfms - {self.tfms.fs}"
268 def iter(self): return (self[i] for i in range(len(self)))

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in call(self, o)
198 self.fs = self.fs.sorted(key=‘order’)
199
→ 200 def call(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
201 def repr(self): return f"Pipeline: {’ → '.join([f.name for f in self.fs if f.name != ‘noop’])}"
202 def getitem(self,i): return self.fs[i]

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
148 for f in tfms:
149 if not is_enc: f = f.decode
→ 150 x = f(x, **kwargs)
151 return x
152

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in call(self, x, **kwargs)
71 @property
72 def name(self): return getattr(self, ‘_name’, _get_name(self))
—> 73 def call(self, x, **kwargs): return self._call(‘encodes’, x, **kwargs)
74 def decode (self, x, **kwargs): return self._call(‘decodes’, x, **kwargs)
75 def repr(self): return f’{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}’

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
81 def _call(self, fn, x, split_idx=None, **kwargs):
82 if split_idx!=self.split_idx and self.split_idx is not None: return x
—> 83 return self._do_call(getattr(self, fn), x, **kwargs)
84
85 def _do_call(self, f, x, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastcore/transform.py in do_call(self, f, x, **kwargs)
87 if f is None: return x
88 ret = f.returns(x) if hasattr(f,‘returns’) else None
—> 89 return retain_type(f(x, **kwargs), x, ret)
90 res = tuple(self.do_call(f, x, **kwargs) for x
in x)
91 return retain_type(res, x)

/opt/conda/lib/python3.7/site-packages/fastcore/dispatch.py in call(self, *args, **kwargs)
121 elif self.inst is not None: f = MethodType(f, self.inst)
122 elif self.owner is not None: f = MethodType(f, self.owner)
→ 123 return f(*args, **kwargs)
124
125 def get(self, inst, owner):

/tmp/ipykernel_11948/34029949.py in get_weed_label(o)
----> 1 def get_weed_label( o): return df[df[“Filename”].str.contains(o)].Species.iloc[0]

/opt/conda/lib/python3.7/site-packages/pandas/core/strings/accessor.py in wrapper(self, *args, **kwargs)
114 )
115 raise TypeError(msg)
→ 116 return func(self, *args, **kwargs)
117
118 wrapper.name = func_name

/opt/conda/lib/python3.7/site-packages/pandas/core/strings/accessor.py in contains(self, pat, case, flags, na, regex)
1151 dtype: bool
1152 “”"
→ 1153 if regex and re.compile(pat).groups:
1154 warnings.warn(
1155 "This pattern has match groups. To actually get the "

/opt/conda/lib/python3.7/re.py in compile(pattern, flags)
234 def compile(pattern, flags=0):
235 “Compile a regular expression pattern, returning a Pattern object.”
→ 236 return _compile(pattern, flags)
237
238 def purge():

/opt/conda/lib/python3.7/re.py in _compile(pattern, flags)
285 return pattern
286 if not sre_compile.isstring(pattern):
→ 287 raise TypeError(“first argument must be string or compiled pattern”)
288 p = sre_compile.compile(pattern, flags)
289 if not (flags & DEBUG):

TypeError: first argument must be string or compiled pattern

Hi @nhenseler ,

In your function, the parameter o is an instance of type PosixPath which is not what the contains function accepts.

https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html

You need to pass it a string or a regular expression pattern.

I think you may simply try

def get_weed_label(o):
    """Return the species label of the first row in `df` matching image `o`.

    `o` is converted with `str()` because the item arrives as a path object,
    while `Series.str.contains` only accepts a string pattern.
    `regex=False` makes the match literal, so characters such as `.`, `(`
    or `+` in a file name are not interpreted as regular-expression syntax.

    Raises IndexError if no row's "Filename" contains `str(o)`.
    """
    # NOTE: str(o) is the whole path of the item. If the CSV stores bare file
    # names rather than paths, nothing will match -- verify against your data.
    return df[df["Filename"].str.contains(str(o), regex=False)].Species.iloc[0]

This is enough if you just want the Species entry of the row in your data frame that matches the file name.

Also, if there are multiple labels corresponding to this filename in your df i.e. multilabel classification, you may want to try

def get_weed_label(o):
    """Return every species listed for image `o` as one comma-separated string.

    All rows of `df` whose "Filename" contains `str(o)` are selected; their
    distinct Species values are joined with "," in order of first appearance.
    `regex=False` makes the match literal, so characters such as `.`, `(`
    or `+` in a file name are not interpreted as regular-expression syntax.

    Returns an empty string when no row matches.
    """
    matches = df[df["Filename"].str.contains(str(o), regex=False)]
    # unique() drops repeated species so each label appears only once.
    return ",".join(matches.Species.unique().tolist())

Hope this helps!

Thanks,
Vinayak.

1 Like

thanks a lot Vinayak
The second version works, the first comes back with the same error message.
Now I have to understand what it means that a picture corresponds to multiple labels. Should these pictures be excluded from training?

1 Like

Hi @nhenseler

My bad, yeah the first version will not work.

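Assume that the filename match happens at index 23; then your filtered df will not have the index label 0 in it. For example, consider the following screenshot, which might help you understand this better. (Note that label-based access such as .loc[0] is what fails on such a filtered frame; .iloc[0] selects by position and still returns the first matching row.)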

df_demo.ipynb · GitHub

So you could modify the first version as follows to make it work

def get_weed_label(o):
    """Return the species label of the first row in `df` matching image `o`.

    The rows whose "Filename" contains `str(o)` are selected, their index is
    renumbered from 0 with `reset_index(drop=True)`, and the first Species
    value is returned. `regex=False` makes the match literal, so characters
    such as `.`, `(` or `+` in a file name are not interpreted as
    regular-expression syntax.

    Raises IndexError if no row matches.
    """
    matches = df[df["Filename"].str.contains(str(o), regex=False)]
    return matches.Species.reset_index(drop=True).iloc[0]

As for your second question: if a given image belongs to two or more species, you’ll have to resort to multi-label classification. You could refer to this chapter for more information on that.

Hope this helps.

Thanks,
Vinayak.

1 Like