Custom ItemList, getting ForkingPickler broken pipe

etremblay · February 24, 2019, 3:51am

I am trying to implement a custom ItemList simply to merge some Image data with some tabular data. Right now I am simply trying to make it work with my custom classes, and my custom ItemBase simply returns the image data so that I can test with a simple cnn model…

I didn’t want to rewrite all the custom logic inside ImageItemList and TabularList, so my custom ItemList simply creates them internally so that I can re-use them. The problem is that I seem to be running into a parallelism problem, I think, and I am getting a ForkingPickler broken pipe error message. Do you think that re-using existing ItemLists inside a custom ItemList could be the problem? When I set num_workers=0, lr_find seems to run, but very, very slowly.

Here is my code:

gist.github.com

https://gist.github.com/EtienneT/c07994bc96e9fad7a30a89cb9f20bc6b

ImageTabularDataBunch.py


#%%
from fastai.tabular import *
from fastai.vision import *
from fastai.metrics import *

def _maybe_add_crop_pad(tfms):
    "Return the (train, valid) transform lists, prepending `crop_pad()` to each list that has no transform named 'crop_pad'."
    assert is_listy(tfms) and len(tfms) == 2, "Please pass a list of two lists of transforms (train and valid)."
    # Gather the transform names of every split first, before any list is rebuilt.
    names_per_split = []
    for split_tfms in tfms:
        names_per_split.append([t.__name__ for t in split_tfms])
    padded = []
    for split_tfms, names in zip(tfms, names_per_split):
        if 'crop_pad' in names:
            # Already present: hand back the caller's list untouched.
            padded.append(split_tfms)
        else:
            padded.append([crop_pad()] + split_tfms)
    return padded

This file has been truncated. show original

And here is the exception I get while running learn.lr_find():

---------------------------------------------------------------------------
BrokenPipeError                           Traceback (most recent call last)
<ipython-input-19-c7a9c29f9dd1> in <module>
----> 1 learn.lr_find()
      2 learn.recorder.plot()

c:\work\ml\fastai-dev\fastai\fastai\train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
     30     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     31     a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 32     learn.fit(a, start_lr, callbacks=[cb], wd=wd)
     33 
     34 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(self, epochs, lr, wd, callbacks)
    176         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    177         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 178             callbacks=self.callbacks+callbacks)
    179 
    180     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

c:\work\ml\fastai-dev\fastai\fastai\utils\mem.py in wrapper(*args, **kwargs)
    101 
    102         try:
--> 103             return func(*args, **kwargs)
    104         except Exception as e:
    105             if ("CUDA out of memory" in str(e) or

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     86             cb_handler.set_dl(data.train_dl)
     87             cb_handler.on_epoch_begin()
---> 88             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     89                 xb, yb = cb_handler.on_batch_begin(xb, yb)
     90                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)

~\Anaconda3\envs\fastai-master\lib\site-packages\fastprogress\fastprogress.py in __iter__(self)
     63         self.update(0)
     64         try:
---> 65             for i,o in enumerate(self._gen):
     66                 yield o
     67                 if self.auto_update: self.update(i+1)

c:\work\ml\fastai-dev\fastai\fastai\basic_data.py in __iter__(self)
     69     def __iter__(self):
     70         "Process and returns items from `DataLoader`."
---> 71         for b in self.dl: yield self.proc_batch(b)
     72 
     73     @classmethod

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
    817 
    818     def __iter__(self):
--> 819         return _DataLoaderIter(self)
    820 
    821     def __len__(self):

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
    558                 #     before it starts, and __del__ tries to join but will get:
    559                 #     AssertionError: can only join a started process.
--> 560                 w.start()
    561                 self.index_queues.append(index_queue)
    562                 self.workers.append(w)

~\Anaconda3\envs\fastai-master\lib\multiprocessing\process.py in start(self)
    110                'daemonic processes are not allowed to have children'
    111         _cleanup()
--> 112         self._popen = self._Popen(self)
    113         self._sentinel = self._popen.sentinel
    114         # Avoid a refcycle if the target function holds an indirect

~\Anaconda3\envs\fastai-master\lib\multiprocessing\context.py in _Popen(process_obj)
    221     @staticmethod
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 
    225 class DefaultContext(BaseContext):

~\Anaconda3\envs\fastai-master\lib\multiprocessing\context.py in _Popen(process_obj)
    320         def _Popen(process_obj):
    321             from .popen_spawn_win32 import Popen
--> 322             return Popen(process_obj)
    323 
    324     class SpawnContext(BaseContext):

~\Anaconda3\envs\fastai-master\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
     63             try:
     64                 reduction.dump(prep_data, to_child)
---> 65                 reduction.dump(process_obj, to_child)
     66             finally:
     67                 set_spawning_popen(None)

~\Anaconda3\envs\fastai-master\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
     58 def dump(obj, file, protocol=None):
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 
     62 #

BrokenPipeError: [Errno 32] Broken pipe

etremblay · February 24, 2019, 6:45pm

So since I am on windows, it seems like I have to wrap my code with a special if to avoid process forking creating problems:

if __name__ == '__main__':
    …code doing the training loop…

Not a big deal but @sgugger do you have any idea how we could avoid that?

Thanks!

sgugger · February 24, 2019, 6:56pm

I don’t use PyTorch much on Windows. Thanks for sharing the workaround, but there’s little we can do in fastai, it has to be fixed on PyTorch level.

etremblay · February 24, 2019, 8:07pm

Thanks for your quick response! There must be something I can do because I don’t have to use this if normally just by using fastai. It only does this with my custom ItemList. Will post solution here if I ever find why.

Thanks,

sgugger · February 24, 2019, 9:16pm

Ah, maybe you need to package your custom ItemList to make pickle happy.

etremblay · February 24, 2019, 9:41pm

Hum, excellent I will look into that!

While playing with the ItemList api and reusing existing ItemLists for my purposes, do you think it would be a good idea to have a special kind of ItemList that simply concatenates the “get” results from other ItemLists?

Basically having a MergingItemList where you simply pass it a list of ItemList and the get method of MergingItemList would simply call the get method of all the ItemList you passed to it and would concatenate the results into a vector. This is pseudo-code, but something along those lines:

imgList = ImageItemList.from_df(…)
tabList = TabularList.from_df(…)

mergedList = MergingItemList.from_itemlist([imgList, tabList])

Then a call to mergedList.get(i) would return a vector of the result of imgList.get and tabList.get while respecting image transforms etc.

I feel this would be a good entry point for combining different types of data to use in a model while re-using parts of the existing API. Maybe paving the way eventually to something like Ludwig where you can pass-in any kind of data, let the model combine them using some encoder, then output any kind of data out from the model (images, sequences, classification, regression etc).

Obviously I am only talking about the data block part of this, but I feel like fastai should also make it easy to merge different types of data.

Thanks!

sgugger · February 25, 2019, 2:23am

Interesting point, and definitely something we can look at. You would need your custom models afterward, as the data would be fed in a format like batch_of_images,batch_of_tabular in your example, but it’s not too hard to add and I can see how people could find it useful.

tcapelle · February 25, 2019, 8:47am

Hello, I am very interested in this problem. I was playing with feeding images together with their metadata (focal length, diaphragm, etc…) but don’t know what would be the best custom model to feed these together. Continuous + Cat variables + Image, I was thinking about concatenating the outputs of both networks (resnet+tabular) and creating a custom head for both, after a Cat.
I really think that merging data types has lots of useful applications.

etremblay · February 25, 2019, 3:24pm

Yes custom model afterward is fine. But the idea is re-using existing fastai components to be able to merge different type of data together. For example in my case, I have a Question (text data), the question gets answered, images can be associated to it, I have tabular data about the context of the question and I want to predict if the question is going to pass or fail.

Before even thinking about the model, I need to feed the model with the data. But once we have the data, why not re-use the different parts of fastai like pre-trained image model, pre-trained language model, categorical variable embeddings etc to merge them together and help predict what we are interested in.

Industry datasets are much more heterogeneous than academic datasets. I think merging all those types of data and making it easy to use transfer learning for the different parts would be really interesting for industry.

sgugger · February 25, 2019, 4:42pm

Ok, pushed MixedItemList in master. Here is an example of use:

path = untar_data(URLs.MNIST_SAMPLE)
df = pd.read_csv(path/'labels.csv')
image_il = ImageItemList.from_df(df.iloc[:1000], path=path, cols='name')

path1 = untar_data(URLs.IMDB_SAMPLE)
text_il = TextList.from_csv(path1, 'texts.csv', cols='text')

path2 = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path2/'adult.csv')

dep_var = 'salary'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [FillMissing, Categorify, Normalize]

tab_il = TabularList.from_df(df.iloc[:1000], path=path2, cat_names=cat_names, cont_names=cont_names, procs=procs)

tst_il = MixedItemList([image_il, text_il, tab_il], path=path)

Note that this one won’t be able to get to the databunch stage since the texts aren’t all of the same length so can’t be directly batched together without writing a custom collate function, but each data is processed properly after the split and labeling.

If you want to take the labels from one of the csv files, let’s say the first one, you have to tell the MixedItemList which inner dataframe to use (here the one from image_il):

tst_il = MixedItemList([image_il, text_il, tab_il], path=path, inner_df=image_il.inner_df)

If you want to apply data augmentation, when it is time to call transform, you need to pass two lists (train/valid), each containing three lists of transforms (one each for image_il, text_il, tab_il). For instance:

src = tst_il.random_split_by_pct()
src = src.label_from_df(cols='label')
src = src.transform([[[pad(padding=4)], [], []], [[],[],[]]])

As I said, src.databunch() doesn’t work since there is text; if you remove the text ItemList and adapt the code accordingly, it works perfectly and the batches will look like:
[[batch of images, [batch of cats, batch of conts]], batch of labels]
(or more generally [[batch of first il, batch of second il, …], batch of labels])

As I noted before, this will require a custom model to work, but you get all the processing done together.

etremblay · February 25, 2019, 6:20pm

Wow that was fast! Really really nice!

And now to the actual hard task of writing my first custom PyTorch model :). Will start with Image + tabular to start.

Thanks @sgugger!

etremblay · February 27, 2019, 1:39am

I am getting the following exception while trying to do lr_find while using MixedItemList:

invalid argument 0: Sizes of tensors must match except in dimension 0. Got 225 and 300 in dimension 2

I am using the new MixedItemList with just Image and tabular data to start “simple”.

Here is my setup code for the learner:

imgList = ImageList.from_df(df, path=path, cols='PicturePath')
tabList = TabularList.from_df(df, cat_names=cat_names, cont_names=cont_names, procs=procs)

mixed = MixedItemList([imgList, tabList], path, inner_df=imgList.inner_df).random_split_by_pct(.2).label_from_df(cols='Passed')
data = mixed.databunch(no_check=True, bs=64, num_workers=0) # num_workers=0 here just to get errors more quickly

emb = mixed.train.x.item_lists[1].get_emb_szs()
model = ImageTabularModel(emb, len(cont_names), [1000, 500])

learn = Learner(data, model, metrics=accuracy)

Here is my model (this is my first pytorch model, so not 100% certain I am doing this correctly)… I tried to reuse as much stuff from fastai as I could:

from fastai.torch_core import *
from fastai.vision import *
from fastai.tabular.models import *
from fastai.tabular import *
from fastai.layers import *
import torch

class ImageTabularModel(nn.Module):
    "Model feeding an image through a resnet34 body and tabular data through a `TabularModel`, then merging both."
    def __init__(self, emb_szs:ListSizes, n_cont:int, layers:Collection[int], ps:Collection[float]=None):
        super().__init__()
        # Image branch: body created from resnet34.
        self.cnn = create_body(models.resnet34)
        # Tabular branch; 512 is passed as the third argument (presumably the output size — confirm).
        self.tab = TabularModel(emb_szs, n_cont, 512, layers, ps)

        # Flatten the CNN output, then a `bn_drop_lin` block from 512*7*7 to 512 features.
        # NOTE(review): 512*7*7 presumably assumes a 7x7 feature map, i.e. a fixed input image size — confirm.
        self.reduce = nn.Sequential(*([Flatten()] + bn_drop_lin((512*7*7), 512, bn=True, p=0.5, actn=nn.ReLU(inplace=True))))
        # Block applied to the 512 + 512 features of the two branches.
        self.merge = nn.Sequential(*bn_drop_lin(512 + 512, 1024, bn=True, p=0.5, actn=nn.ReLU(inplace=True)))
        # Final block from 1024 features to 2 outputs.
        self.final = nn.Sequential(*bn_drop_lin(1024, 2, bn=True, p=0., actn=nn.ReLU(inplace=True)))

    # NOTE(review): takes the categorical and continuous tensors as two separate
    # arguments; confirm this matches how the caller unpacks the input batch.
    def forward(self, img:Tensor, x_cat:Tensor, x_cont:Tensor) -> Tensor:
        imgLatent = self.reduce(self.cnn(img))
        tabLatent = self.tab(x_cat, x_cont)

        # NOTE(review): `torch.cat` expects a sequence of tensors as its first argument;
        # here the two tensors are passed as separate positional arguments — verify.
        cat = torch.cat(imgLatent, tabLatent)

        return self.final(self.merge(cat))

But then when I call learn.lr_find(), I get the following exception:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-19-c7a9c29f9dd1> in <module>
----> 1 learn.lr_find()
      2 learn.recorder.plot()

c:\work\ml\fastai-dev\fastai\fastai\train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
     30     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     31     epochs = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 32     learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)
     33 
     34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=False, clip:float=None,

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(self, epochs, lr, wd, callbacks)
    180         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
    181         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 182             callbacks=self.callbacks+callbacks)
    183 
    184     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

c:\work\ml\fastai-dev\fastai\fastai\utils\mem.py in wrapper(*args, **kwargs)
     87 
     88         try:
---> 89             return func(*args, **kwargs)
     90         except Exception as e:
     91             if ("CUDA out of memory" in str(e) or

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     88             cb_handler.set_dl(data.train_dl)
     89             cb_handler.on_epoch_begin()
---> 90             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     91                 xb, yb = cb_handler.on_batch_begin(xb, yb)
     92                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)

~\Anaconda3\envs\fastai-master\lib\site-packages\fastprogress\fastprogress.py in __iter__(self)
     63         self.update(0)
     64         try:
---> 65             for i,o in enumerate(self._gen):
     66                 yield o
     67                 if self.auto_update: self.update(i+1)

c:\work\ml\fastai-dev\fastai\fastai\basic_data.py in __iter__(self)
     73     def __iter__(self):
     74         "Process and returns items from `DataLoader`."
---> 75         for b in self.dl: yield self.proc_batch(b)
     76 
     77     @classmethod

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    613         if self.num_workers == 0:  # same-process loading
    614             indices = next(self.sample_iter)  # may raise StopIteration
--> 615             batch = self.collate_fn([self.dataset[i] for i in indices])
    616             if self.pin_memory:
    617                 batch = pin_memory_batch(batch)

c:\work\ml\fastai-dev\fastai\fastai\torch_core.py in data_collate(batch)
    115 def data_collate(batch:ItemsList)->Tensor:
    116     "Convert `batch` items to tensor data."
--> 117     return torch.utils.data.dataloader.default_collate(to_data(batch))
    118 
    119 def requires_grad(m:nn.Module, b:Optional[bool]=None)->Optional[bool]:

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in default_collate(batch)
    230     elif isinstance(batch[0], container_abcs.Sequence):
    231         transposed = zip(*batch)
--> 232         return [default_collate(samples) for samples in transposed]
    233 
    234     raise TypeError((error_msg.format(type(batch[0]))))

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in <listcomp>(.0)
    230     elif isinstance(batch[0], container_abcs.Sequence):
    231         transposed = zip(*batch)
--> 232         return [default_collate(samples) for samples in transposed]
    233 
    234     raise TypeError((error_msg.format(type(batch[0]))))

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in default_collate(batch)
    230     elif isinstance(batch[0], container_abcs.Sequence):
    231         transposed = zip(*batch)
--> 232         return [default_collate(samples) for samples in transposed]
    233 
    234     raise TypeError((error_msg.format(type(batch[0]))))

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in <listcomp>(.0)
    230     elif isinstance(batch[0], container_abcs.Sequence):
    231         transposed = zip(*batch)
--> 232         return [default_collate(samples) for samples in transposed]
    233 
    234     raise TypeError((error_msg.format(type(batch[0]))))

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\utils\data\dataloader.py in default_collate(batch)
    207             storage = batch[0].storage()._new_shared(numel)
    208             out = batch[0].new(storage)
--> 209         return torch.stack(batch, 0, out=out)
    210     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
    211             and elem_type.__name__ != 'string_':

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 225 and 300 in dimension 3 at c:\a\w\1\s\tmp_conda_3.7_061434\conda\conda-bld\pytorch_1544163540495\work\aten\src\th\generic/THTensorMoreMath.cpp:1333

etremblay · February 27, 2019, 2:06am

Searching in the forum I see now that this is related to the image sizes. So digging a bit I managed to make MixedItemList work I think and applying image transforms:

mixed = (MixedItemList([imgList, tabList], path, inner_df=imgList.inner_df)
         .random_split_by_pct(.2)
         .label_from_df(cols='Passed')
         .transform([[get_transforms()[0], []], [get_transforms()[1], []]], size=224))

But then my model forward method might not be declared correctly, I am getting the following error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-34-c7a9c29f9dd1> in <module>
----> 1 learn.lr_find()
      2 learn.recorder.plot()

c:\work\ml\fastai-dev\fastai\fastai\train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
     30     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     31     epochs = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 32     learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)
     33 
     34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=False, clip:float=None,

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(self, epochs, lr, wd, callbacks)
    180         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
    181         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 182             callbacks=self.callbacks+callbacks)
    183 
    184     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

c:\work\ml\fastai-dev\fastai\fastai\utils\mem.py in wrapper(*args, **kwargs)
     87 
     88         try:
---> 89             return func(*args, **kwargs)
     90         except Exception as e:
     91             if ("CUDA out of memory" in str(e) or

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     90             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     91                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 92                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     93                 if cb_handler.on_batch_end(loss): break
     94 

c:\work\ml\fastai-dev\fastai\fastai\basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     22     if not is_listy(xb): xb = [xb]
     23     if not is_listy(yb): yb = [yb]
---> 24     out = model(*xb)
     25     out = cb_handler.on_loss_begin(out)
     26 

~\Anaconda3\envs\fastai-master\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

TypeError: forward() missing 1 required positional argument: 'x_cont'

etremblay · February 27, 2019, 3:23am

Took a pause, figured out what was wrong with my model. Here is my model for anyone interested.

class ImageTabularModel(nn.Module):
    """Model combining an image branch (resnet34 body) with a tabular branch (`TabularModel`).

    Each branch is reduced to 512 features; the two are concatenated along dim 1,
    passed through a merge block (1024 features) and a final block with 2 outputs.

    `emb_szs`, `n_cont`, `layers` and `ps` are forwarded unchanged to `TabularModel`.
    """
    def __init__(self, emb_szs:ListSizes, n_cont:int, layers:Collection[int], ps:Collection[float]=None):
        super().__init__()
        # Image branch: body created from resnet34.
        self.cnn = create_body(models.resnet34)
        # Tabular branch; 512 is passed as the third argument (presumably the output size — confirm).
        self.tab = TabularModel(emb_szs, n_cont, 512, layers, ps)

        # Flatten the CNN output, then a `bn_drop_lin` block from 512*7*7 to 512 features.
        # NOTE(review): 512*7*7 presumably assumes a 7x7 feature map, i.e. a fixed input image size — confirm.
        self.reduce = nn.Sequential(*([Flatten()] + bn_drop_lin((512*7*7), 512, bn=True, p=0.5, actn=nn.ReLU(inplace=True))))
        # Block applied to the concatenated 512 + 512 features of the two branches.
        self.merge = nn.Sequential(*bn_drop_lin(512 + 512, 1024, bn=True, p=0.5, actn=nn.ReLU(inplace=True)))
        # No activation on the last block: a trailing ReLU would clamp the 2 outputs
        # to be non-negative before they reach the loss function.
        self.final = nn.Sequential(*bn_drop_lin(1024, 2, bn=True, p=0.))

    def forward(self, img:Tensor, x:Tensor) -> Tensor:
        """Return the 2 outputs for a batch; `x` is indexed as `x[0]` and `x[1]`, which are passed to the tabular branch in that order."""
        imgLatent = self.reduce(self.cnn(img))
        tabLatent = self.tab(x[0], x[1])

        # Join the two latent vectors feature-wise (dim 1), keeping the batch dimension.
        cat = torch.cat([imgLatent, tabLatent], dim=1)

        return self.final(self.merge(cat))

etremblay · March 1, 2019, 9:07pm

Where can I learn more about custom collate functions if I want to add a language model to that?

Thanks,

Elfayoumi · March 16, 2019, 7:49am

Hello,
What is ImageItemList, I cannot find documentation on that.
How to create three imageLists and join them in mixedItemList?
thanks

sgugger · March 16, 2019, 12:21pm

It’s been renamed ImageList (you can follow all changes here).

Elfayoumi · March 16, 2019, 12:54pm

thanks.

blissweb · March 17, 2019, 1:47pm

I just updated fastai from git because this command did not include the ‘test_df’ parameter:

data3 = TabularDataBunch.from_df(CUR_DIR, train_df, dep_var, test_df=test_df, valid_idx=valid_idx, procs=procs, cat_names=cat_names, cont_names=cont_names)

Now the test_df does get added as I can see using:

data3.test_ds

which produces:

LabelList (200000 items)
x: TabularList
var_0 11.0625; var_1 7.7812; ,var_0 8.5312; var_1 1.2539; ,var_0 5.4844; var_1 -10.3594; ,var_0 8.5391; var_1 -1.3223; ,var_0 11.7031; var_1 -0.1327; 
y: EmptyLabelList
,,,,
Path: c:\data\code\jupyter\kaggle_santander

However, when I run:

data3.show_batch(rows=10,ds_type=DatasetType.Train)

or try to do an lr_find I get a BrokenPipe like:

---------------------------------------------------------------------------
BrokenPipeError                           Traceback (most recent call last)
<ipython-input-9-8982f998d792> in <module>
----> 1 data3.show_batch(rows=10,ds_type=DatasetType.Train)

C:\Anaconda3\envs\fastai-latest\lib\site-packages\fastai\basic_data.py in show_batch(self, rows, ds_type, reverse, **kwargs)
    183     def show_batch(self, rows:int=5, ds_type:DatasetType=DatasetType.Train, reverse:bool=False, **kwargs)->None:
    184         "Show a batch of data in `ds_type` on a few `rows`."
--> 185         x,y = self.one_batch(ds_type, True, True)
    186         if reverse: x,y = x.flip(0),y.flip(0)
    187         n_items = rows **2 if self.train_ds.x._square_show else rows

C:\Anaconda3\envs\fastai-latest\lib\site-packages\fastai\basic_data.py in one_batch(self, ds_type, detach, denorm, cpu)
    166         w = self.num_workers
    167         self.num_workers = 0
--> 168         try:     x,y = next(iter(dl))
    169         finally: self.num_workers = w
    170         if detach: x,y = to_detach(x,cpu=cpu),to_detach(y,cpu=cpu)

C:\Anaconda3\envs\fastai-latest\lib\site-packages\fastai\basic_data.py in __iter__(self)
     73     def __iter__(self):
     74         "Process and returns items from `DataLoader`."
---> 75         for b in self.dl: yield self.proc_batch(b)
     76 
     77     @classmethod

C:\Anaconda3\envs\fastai-latest\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
    817 
    818     def __iter__(self):
--> 819         return _DataLoaderIter(self)
    820 
    821     def __len__(self):

C:\Anaconda3\envs\fastai-latest\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
    558                 #     before it starts, and __del__ tries to join but will get:
    559                 #     AssertionError: can only join a started process.
--> 560                 w.start()
    561                 self.index_queues.append(index_queue)
    562                 self.workers.append(w)

C:\Anaconda3\envs\fastai-latest\lib\multiprocessing\process.py in start(self)
    103                'daemonic processes are not allowed to have children'
    104         _cleanup()
--> 105         self._popen = self._Popen(self)
    106         self._sentinel = self._popen.sentinel
    107         # Avoid a refcycle if the target function holds an indirect

C:\Anaconda3\envs\fastai-latest\lib\multiprocessing\context.py in _Popen(process_obj)
    221     @staticmethod
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 
    225 class DefaultContext(BaseContext):

C:\Anaconda3\envs\fastai-latest\lib\multiprocessing\context.py in _Popen(process_obj)
    320         def _Popen(process_obj):
    321             from .popen_spawn_win32 import Popen
--> 322             return Popen(process_obj)
    323 
    324     class SpawnContext(BaseContext):

C:\Anaconda3\envs\fastai-latest\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
     63             try:
     64                 reduction.dump(prep_data, to_child)
---> 65                 reduction.dump(process_obj, to_child)
     66             finally:
     67                 set_spawning_popen(None)

C:\Anaconda3\envs\fastai-latest\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
     58 def dump(obj, file, protocol=None):
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 
     62 #

BrokenPipeError: [Errno 32] Broken pipe

I have updated all the packages, tried Pytorch 1.0.0 and 1.0.1 and still get this problem. Obviously I’m running on Windows. Any help would be appreciated.

sgugger · March 17, 2019, 2:45pm

That’s generally caused by multiprocessing on Windows with PyTorch, so you should try with num_workers=0.