Hey Zach, did you end up moving this over? Could you point me to where in the repo/website?
Thanks!
Hi @muellerzr, thank you for your advice here. I am a new user of the fastai library. I built a model that combines image and tabular data, and the model is already trained. Now I want to predict a single record from the test data frame. I used this method to integrate the input image and tabular data:
integratedata,_=get_imagetabdatasets(test_image,tab_data)
The format of integratedata[0] is:
((Image (3, 128, 128), TabularLine [tensor([2]), tensor([-0.6136])]), EmptyLabel 0)
When I called learn.predict(integratedata), the error was: 'ImageTabDataset' object has no attribute 'set_item'. So what should I do to run inference on a single input, i.e. a single record from the data frame? I hope my question is clear.
I used this notebook as a reference https://github.com/naity/image_tabular/blob/master/siim_isic_integrated_model.ipynb
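One workaround I am considering (a rough sketch, not tested; the .data attributes and the model's forward signature are assumptions based on that notebook) is to bypass learn.predict and push the single item through the underlying PyTorch model directly:

import torch

learn.model.eval()
device = next(learn.model.parameters()).device
img, tab = integratedata[0][0]                                   # (Image, TabularLine) pair for one record
img_t = img.data.unsqueeze(0).to(device)                         # add a batch dimension to the image tensor
cat_t, cont_t = [t.unsqueeze(0).to(device) for t in tab.data]    # categorical codes and continuous values

with torch.no_grad():
    # assuming the combined model's forward takes (image, (cats, conts)); adjust to your model
    preds = learn.model(img_t, (cat_t, cont_t))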
Hi all!
I am using the MixedDL to combine Tabular and NLP.
mixedDL1 = MixedDL(self.tab_dl[0], self.nlp_dl[0])
mixedDL2 = MixedDL(self.tab_dl[1], self.nlp_dl[1])
self.dls = DataLoaders(mixedDL1, mixedDL2)
I am using the MixedDL class with the one_batch function that @muellerzr defined:
def one_batch(self):
    "Grab one batch of data"
    with self.fake_l.no_multiproc():
        res = first(self)
        if hasattr(self, 'it'):
            delattr(self, 'it')
        return res
But when I run this function I get the following error:
File "/home/admin/PycharmProjects/tabular-nlp/tabular_nlp/concat_model/concat_pipeline.py", line 318, in create_databunch
batch = mixedDL1.one_batch()
File "/home/admin/PycharmProjects/tabular-nlp/tabular_nlp/concat_model/concat_pipeline.py", line 93, in one_batch
res = first(self)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastcore/basics.py", line 547, in first
return next(x, None)
File "/home/admin/PycharmProjects/tabular-nlp/tabular_nlp/concat_model/concat_pipeline.py", line 77, in __iter__
z = zip(*[_loaders[i.fake_l.num_workers == 0](i.fake_l) for i in self.dls])
File "/home/admin/PycharmProjects/tabular-nlp/tabular_nlp/concat_model/concat_pipeline.py", line 77, in <listcomp>
z = zip(*[_loaders[i.fake_l.num_workers == 0](i.fake_l) for i in self.dls])
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 552, in __init__
self._dataset_fetcher = _DatasetKind.create_fetcher(
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 51, in create_fetcher
return _utils.fetch._IterableDatasetFetcher(dataset, auto_collation, collate_fn, drop_last)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 21, in __init__
self.dataset_iter = iter(dataset)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastai/data/load.py", line 30, in __iter__
def __iter__(self): return iter(self.d.create_batches(self.d.sample()))
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastai/data/load.py", line 103, in sample
return (b for i,b in enumerate(self.__idxs) if i//(self.bs or 1)%self.num_workers==self.offs)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastcore/basics.py", line 388, in __getattr__
if attr is not None: return getattr(attr,k)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastcore/basics.py", line 388, in __getattr__
if attr is not None: return getattr(attr,k)
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastcore/transform.py", line 204, in __getattr__
def __getattr__(self,k): return gather_attrs(self, k, 'fs')
File "/home/admin/.virtualenvs/tabular-nlp/lib/python3.8/site-packages/fastcore/transform.py", line 162, in gather_attrs
if k.startswith('_') or k==nm: raise AttributeError(k)
AttributeError: _DataLoader__idxs
Could someone tell me where this error comes from? Or how can I fix it?
Thanks in advance!
Can you share the full MixedDL code that you are using?
Yes, this is the full MixedDL code that I am using:
# imports needed to run this class (fastai v2)
import matplotlib.pyplot as plt
from fastcore.basics import first
from fastai.torch_core import to_device
from fastai.data.core import DataLoaders
from fastai.data.load import _FakeLoader, _loaders
from fastai.tabular.all import TabDataLoader


class MixedDL:
    def __init__(self, tab_dl: TabDataLoader, nlp_dl: DataLoaders, device="cpu:0"):
        "Stores away `tab_dl` and `nlp_dl`, and overrides `shuffle_fn`"
        self.device = device
        tab_dl.shuffle_fn = self.shuffle_fn
        nlp_dl.shuffle_fn = self.shuffle_fn
        self.dls = [tab_dl, nlp_dl]
        self.count = 0
        self.fake_l = _FakeLoader(self, False, 0, 0, 0)

    def __len__(self):
        return len(self.dls[0])

    def shuffle_fn(self, idxs):
        "Generates a new `rng` based upon which `DataLoader` is called"
        if self.count == 0:
            self.rng = self.dls[0].rng.sample(idxs, len(idxs))
            self.count += 1
            return self.rng
        else:
            self.count = 0
            return self.rng

    def to(self, device):
        self.device = device

    def __iter__(self):
        "Iterate over your `DataLoader`"
        z = zip(*[_loaders[i.fake_l.num_workers == 0](i.fake_l) for i in self.dls])
        for b in z:
            if self.device is not None:
                b = to_device(b, self.device)
            batch = []
            batch.extend(self.dls[0].after_batch(b[0])[:2])   # tabular: cats, conts
            batch.append(self.dls[1].after_batch(b[1][0]))    # text input
            try:
                batch.append(b[1][1])                         # target, if present
                yield tuple(batch)
            except:
                yield tuple(batch)

    def one_batch(self):
        "Grab a batch from the `DataLoader`"
        with self.fake_l.no_multiproc():
            res = first(self)
            if hasattr(self, "it"):
                delattr(self, "it")
            return res

    def show_batch(self):
        "Show a batch from multiple `DataLoaders`"
        for dl in self.dls:
            dl.show_batch()
        plt.show()
Using this I mixed tabular and nlp dataloaders:
mixedDL1 = MixedDL(self.tab_dl[0], self.nlp_dl[0])
mixedDL2 = MixedDL(self.tab_dl[1], self.nlp_dl[1])
where self.tab_dl is a TabularDataLoaders, self.tab_dl[0] is a TabDataLoader, self.nlp_dl is a DataLoaders, and self.nlp_dl[0] is a SortedDL.
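The batch that __iter__ yields is (cats, conts, text, target), so a model consuming it needs to accept those three inputs. It would look roughly like this (just a sketch; it assumes the tabular and text sub-models already have their classification heads cut off and output feature vectors, and all names are placeholders):

import torch
import torch.nn as nn

class TabTextModel(nn.Module):
    "Fusion head: concatenate tabular and text features, then classify"
    def __init__(self, tab_model, text_model, tab_feats, text_feats, n_out):
        super().__init__()
        self.tab_model, self.text_model = tab_model, text_model
        self.head = nn.Sequential(nn.ReLU(), nn.Linear(tab_feats + text_feats, n_out))

    def forward(self, x_cat, x_cont, x_text):
        t = self.tab_model(x_cat, x_cont)        # tabular feature vector
        x = self.text_model(x_text)              # text feature vector
        if isinstance(x, tuple): x = x[0]        # some text encoders return (output, ...) tuples
        return self.head(torch.cat([t, x], dim=1))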
Hi @muellerzr Zach, fascinating work! Do you have a Colab notebook/GitHub repo to test out this hybrid model? It's easier for me to follow if there is a sample dataset to play with.
Hello @Saioa, glad to see your experiment on the tab+text hybrid! Do you have any updates on it? I have an application where I want to test out this hybrid approach.
Sadly I do not, the data was proprietary :( but we can debug anything you're working on together.
Hi @wjlgatech!
No, I didn't make any more progress on the hybrid model. For the problem I was facing, it was enough to add the loss of the NLP model as a column for the tabular model, and that's how we solved it.
Still, at some point I want to go back to this code, so please keep me up to date on any progress you make along the way.
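Roughly, the trick was something like this (a sketch in fastai v2; text_learn, the dataframes and the column name are placeholders, and you have to check that the row order of each dataframe matches the order of the items in the corresponding split):

# per-row loss of the trained text classifier, appended as a feature for the tabular model
preds, targs, losses = text_learn.get_preds(ds_idx=0, with_loss=True)
df_train['nlp_loss'] = losses.numpy()

preds, targs, losses = text_learn.get_preds(ds_idx=1, with_loss=True)
df_valid['nlp_loss'] = losses.numpy()

# then build the tabular learner with 'nlp_loss' added to cont_names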
Hi @muellerzr, No worries. Totally understand. Thanks for offering insights and help! I will put some public datasets in a colab notebook and let’s work on it together from there.
Hello @Saioa, your approach makes sense. I worked around it in a similar way: I train a fastai text classifier and extract its embeddings as input for the fastai tabular model. Computationally it is slow, and many things still need to be optimized.
I guess image+tab is not what you need, but the Gradient Blending approach would also be valuable here, I think.
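At its core it trains per-modality heads alongside the fused head and weights their losses, roughly like this (a sketch with fixed weights for illustration; the actual method estimates the weights from the overfitting behaviour of each head):

import torch.nn as nn

class GradientBlendLoss(nn.Module):
    "Weighted sum of the per-modality losses and the fused-head loss"
    def __init__(self, w_tab=0.3, w_txt=0.3, w_joint=0.4):
        super().__init__()
        self.w = (w_tab, w_txt, w_joint)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, outputs, target):
        # outputs: (tab_logits, txt_logits, joint_logits) from a three-headed model
        tab_out, txt_out, joint_out = outputs
        return (self.w[0] * self.ce(tab_out, target)
                + self.w[1] * self.ce(txt_out, target)
                + self.w[2] * self.ce(joint_out, target))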
Hi @muellerzr @Joan @Saioa,
I put up a GitHub repo to work on the hybrid model, which combines fastai tabular + fastai text.
You can check out the notebook.
I still need help to address a bug in my notebook, which is about putting the hook in the right place in the tabular model and the text model. Any advice would be appreciated!
Ha! I just found a great tutorial: Model hooks | fastai
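For anyone following along, the basic pattern boils down to a standard forward hook on the layer whose activations you want, something like this (the way I pick the penultimate Linear layer here is just a heuristic, so inspect your own model first):

import torch

feats = []
def grab(module, inp, out):
    "Forward hook: stash this layer's output to use as features later"
    feats.append(out.detach().cpu())

# pick the penultimate Linear layer; the right layer depends on your architecture
penult = [m for m in learn.model.modules() if isinstance(m, torch.nn.Linear)][-2]
handle = penult.register_forward_hook(grab)

preds, _ = learn.get_preds(dl=some_dl)   # hooks fire during the forward passes
handle.remove()
embeddings = torch.cat(feats)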
Hi all.
If you want to play with a small dataset (images and tabular data) to detect COVID-19 from chest x-rays, you can use the COVIDcxr dataset.
It consists of 960 CXR images (i.e., 320 COVID-19, 320 normal, and 320 pneumonia) and the associated tabular data (i.e., gender, sex, and view) for each patient.
You can generate the COVIDcxr dataset and then build a mixed DataLoader for it.
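Roughly, the wiring would look like this (a sketch only; the csv path and column names are assumptions, MixedDL is the class shared earlier in this thread, and one shared split keeps the rows of the two loaders aligned):

from fastai.vision.all import *
from fastai.tabular.all import *

df = pd.read_csv('covidcxr/metadata.csv')              # hypothetical path and columns

# one split shared by both loaders so the rows stay aligned between modalities
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
df['is_valid'] = False
df.loc[list(splits[1]), 'is_valid'] = True              # assumes a default integer index

img_dls = ImageDataLoaders.from_df(df, path='covidcxr', fn_col='filename',
                                   label_col='label', valid_col='is_valid',
                                   item_tfms=Resize(224), bs=64)
tab_dls = TabularDataLoaders.from_df(df, cat_names=['sex', 'view'], y_names='label',
                                     procs=[Categorify], valid_idx=list(splits[1]), bs=64)

mixed_train = MixedDL(tab_dls.train, img_dls.train)
mixed_valid = MixedDL(tab_dls.valid, img_dls.valid)
dls = DataLoaders(mixed_train, mixed_valid)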
@MaramMonshi It’s great to see that you put the hybrid model to work. Great job!
Hi @muellerzr ,
I was experimenting a hybrid fastai tabular + text model at this notebook: https://github.com/wjlgatech/fastai-multimodal/blob/main/fastai2_multimodal_tabtxt_public.ipynb
At the last step, muti_learn.fit_one_cycle(), I ran into an error which traced back to:
38 z = zip(*[_loaders[i.fake_l.num_workers==0](i.fake_l) for i in self.dls])
---> 39 for b in z:
40 if self.device is not None:
41 b = to_device(b, self.device)
*** AttributeError: Caught AttributeError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 237, in _worker_loop
fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, auto_collation, collate_fn, drop_last)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 52, in create_fetcher
return _utils.fetch._IterableDatasetFetcher(dataset, auto_collation, collate_fn, drop_last)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 21, in __init__
self.dataset_iter = iter(dataset)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 30, in __iter__
def __iter__(self): return iter(self.d.create_batches(self.d.sample()))
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 103, in sample
return (b for i,b in enumerate(self.__idxs) if i//(self.bs or 1)%self.num_workers==self.offs)
File "/usr/local/lib/python3.7/dist-packages/fastcore/basics.py", line 388, in __getattr__
if attr is not None: return getattr(attr,k)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/core.py", line 335, in __getattr__
def __getattr__(self,k): return gather_attrs(self, k, 'tls')
File "/usr/local/lib/python3.7/dist-packages/fastcore/transform.py", line 162, in gather_attrs
if k.startswith('_') or k==nm: raise AttributeError(k)
AttributeError: _DataLoader__idxs
ipdb> *[_loaders[i.fake_l.num_workers==0](i.fake_l) for i in self.dls]
*** SyntaxError: can't use starred expression here
ipdb> [_loaders[i.fake_l.num_workers==0](i.fake_l) for i in self.dls]
[<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7fd8b13acfd0>, <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fd8b1397810>]
ipdb> *[_loaders[i.fake_l.num_workers==0](i.fake_l) for i in self.dls]: print('haaaa')
*** SyntaxError: illegal target for annotation
ipdb> for b in z: print("ha")
*** AttributeError: Caught AttributeError in DataLoader worker process 2.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 237, in _worker_loop
fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, auto_collation, collate_fn, drop_last)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 52, in create_fetcher
return _utils.fetch._IterableDatasetFetcher(dataset, auto_collation, collate_fn, drop_last)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 21, in __init__
self.dataset_iter = iter(dataset)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 30, in __iter__
def __iter__(self): return iter(self.d.create_batches(self.d.sample()))
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 103, in sample
return (b for i,b in enumerate(self.__idxs) if i//(self.bs or 1)%self.num_workers==self.offs)
File "/usr/local/lib/python3.7/dist-packages/fastcore/basics.py", line 388, in __getattr__
if attr is not None: return getattr(attr,k)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/core.py", line 335, in __getattr__
def __getattr__(self,k): return gather_attrs(self, k, 'tls')
File "/usr/local/lib/python3.7/dist-packages/fastcore/transform.py", line 162, in gather_attrs
if k.startswith('_') or k==nm: raise AttributeError(k)
AttributeError: _DataLoader__idxs
Any advice would be much appreciated!