Read data from different directories

Hello,
I have data with this structure:
./MyFolder/TrainingSet
./MyFolder/ValidationSet

In order to read both training and validation directories I tried:

fnames = get_image_files('./MyFolder', folders=['TrainingSet', 'ValidationSet'])

dblk = DataBlock(blocks=(ImageBlock, MaskBlock(codes)),
                   get_items=fnames,
                   splitter=GrandparentSplitter('TrainingSet', 'ValidationSet'),
                   get_y=get_msk,
                   batch_tfms=[Normalize.from_stats(*imagenet_stats)])

But when I run:

dls = dblk.dataloaders('./MyFolder', bs=256)

I get this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_35/2775810610.py in <module>
----> 1 dls = dblk.dataloaders('./GlomerTopi', bs=256)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
    111 
    112     def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113         dsets = self.datasets(source, verbose=verbose)
    114         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
    115         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in datasets(self, source, verbose)
    105     def datasets(self, source, verbose=False):
    106         self.source = source                     ; pv(f"Collecting items from {source}", verbose)
--> 107         items = (self.get_items or noop)(source) ; pv(f"Found {len(items)} items", verbose)
    108         splits = (self.splitter or RandomSplitter())(items)
    109         pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)

TypeError: 'L' object is not callable

Has `fastai` a specific function to handle such structured data?

fnames in this case is a list of files, but get_items is expecting a function to pull the items itself. You could do something like this to give get_items a function.

fnames_func = partial(get_image_files, './MyFolder', folders=['TrainingSet', 'ValidationSet'])

dblk = DataBlock(blocks=(ImageBlock, MaskBlock(codes)),
                   get_items=fnames_func,
                   splitter=GrandparentSplitter('TrainingSet', 'ValidationSet'),
                   get_y=get_msk,
                   batch_tfms=[Normalize.from_stats(*imagenet_stats)])

I think that’s at least where that specific error is coming from. As for your other question: I think GrandparentSplitter possibly goes one directory level deeper than you really want (tough to say for sure without seeing the full ./MyFolder structure). Assuming so, what you really want is a ParentSplitter, which doesn’t exist but is pretty easy to create.

I think it would look something like this:

splitter = FuncSplitter(lambda o: Path(o).parent.name == 'ValidationSet')
1 Like

Thanks
I have tried to pass fnames as a function via partial as you suggested, but now I get this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_68/2775810610.py in <module>
----> 1 dls = dblk.dataloaders('./GlomerTopi', bs=256)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
    111 
    112     def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113         dsets = self.datasets(source, verbose=verbose)
    114         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
    115         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in datasets(self, source, verbose)
    108         splits = (self.splitter or RandomSplitter())(items)
    109         pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 110         return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
    111 
    112     def dataloaders(self, source, path='.', verbose=False, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    326     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    327         super().__init__(dl_type=dl_type)
--> 328         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    329         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    330 

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in <listcomp>(.0)
    326     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    327         super().__init__(dl_type=dl_type)
--> 328         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    329         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    330 

/opt/conda/lib/python3.7/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     95     def __call__(cls, x=None, *args, **kwargs):
     96         if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97         return super().__call__(x, *args, **kwargs)
     98 
     99 # Cell

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
    252         if do_setup:
    253             pv(f"Setting up {self.tfms}", verbose)
--> 254             self.setup(train_setup=train_setup)
    255 
    256     def _new(self, items, split_idx=None, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in setup(self, train_setup)
    276                 x = f(x)
    277             self.types.append(type(x))
--> 278         types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
    279         self.pretty_types = '\n'.join([f'  - {t}' for t in types])
    280 

TypeError: 'NoneType' object is not iterable

Like Kevin said, it’s hard to know which splitter to use without knowing what the complete folder structure looks like. You can write your own custom functions for the get_items and splitter DataBlock params. Hopefully this helps.

#function for get_items parameter - This should give you the same result as Kevin's partial function.  I have not used the get_image_files function in passing in a list of folders before, so I don't know whether that works without looking it up.
def get_my_files(x):
    """Return image files from the TrainingSet and ValidationSet subfolders.

    `x` is the `source` argument that `DataBlock.dataloaders(source)` forwards
    to `get_items`; searching under it (instead of hard-coding './MyFolder')
    lets the same DataBlock work for any root folder.  With x='./MyFolder'
    this yields the same items as the original hard-coded version.
    """
    # `folders` restricts the recursive search to the two split directories.
    return get_image_files(x, folders=['TrainingSet', 'ValidationSet'])

#Custom Splitter - there are probably better/more efficient ways to write this, but hopefully this leads you in the right direction.
def FileSplitter():
    "Send an item to the validation set iff 'ValidationSet' appears in its path."
    def _func(pth_in):
        # True marks the item as validation; FuncSplitter turns this boolean
        # mask into (train_idxs, valid_idxs).
        return 'ValidationSet' in str(pth_in)
    # Build the FuncSplitter once, rather than re-creating it on every call.
    split = FuncSplitter(_func)
    # Accept (and ignore) **kwargs so this matches the splitter call signature
    # fastai uses; FuncSplitter's inner takes only the items.
    def _inner(o, **kwargs): return split(o)
    return _inner
#splitter=FileSplitter(),
1 Like