Error in 08_pets_tutorial

I am getting the error below when running this code under the "Using TfmdDS" section:

t = tdl.one_batch()
x,y = tdl.decode_batch(t)[0]
x.shape,y

I'm a bit lost on what is going on here. If anyone is more familiar with this and could give me pointers on what I am dealing with, I would appreciate it. I am mostly confused about what "torch_6388_2743225712" has to do with multiprocessing in Python. Is that a file on the hard disk?

The worker processes each print the same traceback (they come out interleaved in the actual output):

Traceback (most recent call last):
  File "/home/fast/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/queues.py", line 234, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/home/fast/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/home/fast/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 323, in reduce_storage
    fd, size = storage._share_fd_()
RuntimeError: unable to write to file </torch_6386_1152570406>

with the same "unable to write to file" RuntimeError also reported for /torch_6388_2743225712, /torch_6393_2317883229, /torch_6383_1334488111, /torch_6390_787730122, /torch_6386_99458230 and /torch_6392_2078896466.

---------------------------------------------------------------------------
ConnectionResetError                      Traceback (most recent call last)
~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _try_get_data(self, timeout)
    723         try:
--> 724             data = self.data_queue.get(timeout=timeout)
    725             return (True, data)

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/queues.py in get(self, block, timeout)
    112         # unserialize the data after having released the lock
--> 113         return _ForkingPickler.loads(res)
    114 

~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/multiprocessing/reductions.py in rebuild_storage_fd(cls, df, size)
    283     else:
--> 284         fd = df.detach()
    285     try:

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/resource_sharer.py in detach(self)
     56             '''Get the fd.  This should only be called once.'''
---> 57             with _resource_sharer.get_connection(self._id) as conn:
     58                 return reduction.recv_handle(conn)

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/resource_sharer.py in get_connection(ident)
     86         address, key = ident
---> 87         c = Client(address, authkey=process.current_process().authkey)
     88         c.send((key, os.getpid()))

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/connection.py in Client(address, family, authkey)
    492     if authkey is not None:
--> 493         answer_challenge(c, authkey)
    494         deliver_challenge(c, authkey)

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/connection.py in answer_challenge(connection, authkey)
    731     assert isinstance(authkey, bytes)
--> 732     message = connection.recv_bytes(256)         # reject large message
    733     assert message[:len(CHALLENGE)] == CHALLENGE, 'message = %r' % message

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/connection.py in recv_bytes(self, maxlength)
    215             raise ValueError("negative maxlength")
--> 216         buf = self._recv_bytes(maxlength)
    217         if buf is None:

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/connection.py in _recv_bytes(self, maxsize)
    406     def _recv_bytes(self, maxsize=None):
--> 407         buf = self._recv(4)
    408         size, = struct.unpack("!i", buf.getvalue())

~/anaconda3/envs/fastai_dev/lib/python3.6/multiprocessing/connection.py in _recv(self, size, read)
    378         while remaining > 0:
--> 379             chunk = read(handle, remaining)
    380             n = len(chunk)

ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-38-dbf5a8dda67b> in <module>
----> 1 t = tdl.one_batch()
      2 x,y = tdl.decode_batch(t)[0]
      3 x.shape,y

~/fastai_dev/dev/local/data/load.py in one_batch(self)
     86     def create_item(self, s):  return next(self.it) if s is None else self.dataset[s]
     87     def create_batch(self, b): return (fa_collate,fa_convert)[self.bs is None](b)
---> 88     def one_batch(self):   return next(iter(self))
     89     def do_item(self, s):  return self.after_item(self.create_item(s))
     90     def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)

~/fastai_dev/dev/local/data/load.py in __iter__(self)
     62     def __iter__(self):
     63         self.before_iter()
---> 64         for b in _loaders[self.fake_l.num_workers==0](self.fake_l): yield self.after_batch(b)
     65         self.after_iter()
     66 

~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
    802 
    803             assert not self.shutdown and self.tasks_outstanding > 0
--> 804             idx, data = self._get_data()
    805             self.tasks_outstanding -= 1
    806 

~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _get_data(self)
    769         else:
    770             while True:
--> 771                 success, data = self._try_get_data()
    772                 if success:
    773                     return data

~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _try_get_data(self, timeout)
    722         #   (bool: whether successfully get data, any: data if successful else None)
    723         try:
--> 724             data = self.data_queue.get(timeout=timeout)
    725             return (True, data)
    726         except Exception as e:

~/anaconda3/envs/fastai_dev/lib/python3.6/site-packages/torch/utils/data/_utils/signal_handling.py in handler(signum, frame)
     64         # This following call uses `waitid` with WNOHANG from C side. Therefore,
     65         # Python can still get and update the process status successfully.
---> 66         _error_if_any_worker_fails()
     67         if previous_handler is not None:
     68             previous_handler(signum, frame)

RuntimeError: DataLoader worker (pid 6385) is killed by signal: Bus error.
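
For reference, here is a sketch of what I could try to check whether the crash is specific to the worker processes (I'm assuming TfmdDL forwards a num_workers argument to the underlying DataLoader, which the fake_l.num_workers check in __iter__ suggests; the dataset name and batch size below are just placeholders):

# Sketch: rebuild the DataLoader single-process so batches are not sent
# through torch.multiprocessing / shared memory at all.
# `tds` and bs=64 stand in for whatever the notebook actually uses.
tdl = TfmdDL(tds, bs=64, num_workers=0)
t = tdl.one_batch()
x,y = tdl.decode_batch(t)[0]
x.shape,y

If that runs cleanly, the problem is presumably in the multi-worker path rather than in the transforms themselves.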

I’m wondering if it’s something to do with this error ^. Did you try it multiple times or just once?

Multiple times, same error, and it also completely kills the kernel.

Hmm, OK, are you on the very latest version? I’m one commit behind master at the moment, but I just tested and it works on my end.

Ah, okay, I will go ahead and update then. I am on week-old code.

Hopefully that fixes it. Post an update either way, and if it doesn’t I will keep trying to reproduce it!

We’re making changes throughout the day, every day, so you should git pull each time you look at fastai v2.


Updated everything and now it seems to work.
