CUDA error: unknown error/ [Errno 32] Broken pipe

macphini · January 3, 2020, 6:16pm

I have tried many times to reproduce the fastai tutorials on my own, but it usually ends in some frustrating error message. I love this library, but I have not been able to run anything with it. Can anyone please help guide me, in case I am doing something wrong? I am on Windows 10 with CUDA 10.0 on a GeForce GTX GPU. Below are the error messages I keep getting.

On running:
learn = cnn_learner(data, models.resnet18, metrics=accuracy)
learn.fit_one_cycle(1,1e-2)
learn.save('mini_train')

RuntimeError Traceback (most recent call last)
in
----> 1 learn = cnn_learner(data, models.resnet18, metrics=accuracy)
2 learn.fit_one_cycle(1,1e-2)
3 learn.save(‘mini_train’)

D:\makin\lib\site-packages\fastai\vision\learner.py in cnn_learner(data, base_arch, cut, pretrained, lin_ftrs, ps, custom_head, split_on, bn_final, init, concat_pool, **kwargs)
96 model = create_cnn_model(base_arch, data.c, cut, pretrained, lin_ftrs, ps=ps, custom_head=custom_head,
97 split_on=split_on, bn_final=bn_final, concat_pool=concat_pool)
—> 98 learn = Learner(data, model, **kwargs)
99 learn.split(split_on or meta[‘split’])
100 if pretrained: learn.freeze()

in __init__(self, data, model, opt_func, loss_func, metrics, true_wd, bn_wd, wd, train_bn, path, model_dir, callback_fns, callbacks, layer_groups, add_time, silent)

D:\makin\lib\site-packages\fastai\basic_train.py in __post_init__(self)
164 self.path = Path(ifnone(self.path, self.data.path))
165 (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)
–> 166 self.model = self.model.to(self.data.device)
167 self.loss_func = self.loss_func or self.data.loss_func
168 self.metrics=listify(self.metrics)

D:\makin\lib\site-packages\torch\nn\modules\module.py in to(self, *args, **kwargs)
384 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
385
–> 386 return self._apply(convert)
387
388 def register_backward_hook(self, hook):

D:\makin\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
191 def _apply(self, fn):
192 for module in self.children():
–> 193 module._apply(fn)
194
195 for param in self._parameters.values():

D:\makin\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
197 # Tensors stored in modules are graph leaves, and we don’t
198 # want to create copy nodes, so we have to unpack the data.
–> 199 param.data = fn(param.data)
200 if param._grad is not None:
201 param._grad.data = fn(param._grad.data)

D:\makin\lib\site-packages\torch\nn\modules\module.py in convert(t)
382
383 def convert(t):
–> 384 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
385
386 return self._apply(convert)

D:\makin\lib\site-packages\torch\cuda\__init__.py in _lazy_init()
161 "Cannot re-initialize CUDA in forked subprocess. " + msg)
162 _check_driver()
–> 163 torch._C._cuda_init()
164 _cudart = _load_cudart()
165 _cudart.cudaGetErrorName.restype = ctypes.c_char_p

RuntimeError: CUDA error: unknown error

Another error here on running:
data.show_batch(rows=3, figsize=(4,4))

BrokenPipeError Traceback (most recent call last)
in
----> 1 data.show_batch(rows=3, figsize=(4,4), num_workers=0)

D:\makin\lib\site-packages\fastai\basic_data.py in show_batch(self, rows, ds_type, reverse, **kwargs)
183 def show_batch(self, rows:int=5, ds_type:DatasetType=DatasetType.Train, reverse:bool=False, **kwargs)->None:
184 “Show a batch of data in ds_type on a few rows.”
–> 185 x,y = self.one_batch(ds_type, True, True)
186 if reverse: x,y = x.flip(0),y.flip(0)
187 n_items = rows **2 if self.train_ds.x._square_show else rows

D:\makin\lib\site-packages\fastai\basic_data.py in one_batch(self, ds_type, detach, denorm, cpu)
166 w = self.num_workers
167 self.num_workers = 0
–> 168 try: x,y = next(iter(dl))
169 finally: self.num_workers = w
170 if detach: x,y = to_detach(x,cpu=cpu),to_detach(y,cpu=cpu)

D:\makin\lib\site-packages\fastai\basic_data.py in __iter__(self)
73 def __iter__(self):
74 “Process and returns items from DataLoader.”
—> 75 for b in self.dl: yield self.proc_batch(b)
76
77 @classmethod

D:\makin\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 return _DataLoaderIter(self)
194
195 def __len__(self):

D:\makin\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
467 # before it starts, and del tries to join but will get:
468 # AssertionError: can only join a started process.
–> 469 w.start()
470 self.index_queues.append(index_queue)
471 self.workers.append(w)

D:\makin\lib\multiprocessing\process.py in start(self)
110 ‘daemonic processes are not allowed to have children’
111 _cleanup()
–> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect

D:\makin\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
–> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):

D:\makin\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
–> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):

D:\makin\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
87 try:
88 reduction.dump(prep_data, to_child)
—> 89 reduction.dump(process_obj, to_child)
90 finally:
91 set_spawning_popen(None)

D:\makin\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
—> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #

BrokenPipeError: [Errno 32] Broken pipe

mrfabulous1 · January 4, 2020, 8:45pm

Hi macphini, hope all is well!

Have you searched the forum for “windows 10”? There are many threads on it, and someone on
those threads might have the piece of information you need to solve your problem.
Below is one such example.

I use Unix, so I am not familiar with the Windows setup.

Cheers mrfabulous1

shahid · January 5, 2020, 4:51am

If you’re using Windows, you will likely have encountered this problem. I solved it by setting the parameter num_workers=0 in the databunch function. Adding a snippet for the same: DatabunchError

CUDA error: unknown error/ [Errno 32] Broken pipe

on running: learn = cnn_learner(data, models.resnet18, metrics=accuracy) learn.fit_one_cycle(1,1e-2) learn.save(‘mini_train’)

Another error here on running: data.show_batch(rows=3, figsize=(4,4))

on running:
learn = cnn_learner(data, models.resnet18, metrics=accuracy)
learn.fit_one_cycle(1,1e-2)
learn.save(‘mini_train’)

Another error here on running:
data.show_batch(rows=3, figsize=(4,4))