GCP platform, lesson 1, failed execution of `learn.fit_one_cycle(4)`

I’m struggling to get the notebook for the first lesson to run on GCP. I followed the tutorial for creating a new instance on GCP, and everything runs fine until cell 51, `learn.fit_one_cycle(4)`, after which I get:

```
epoch	train_loss	valid_loss	error_rate	time

 Interrupted
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-51-495233eaf2b4> in <module>
----> 1 learn.fit_one_cycle(4)

/opt/conda/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     21     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     22                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 23     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     24 
     25 def fit_fc(learn:Learner, tot_epochs:int=1, lr:float=defaults.lr,  moms:Tuple[float,float]=(0.95,0.85), start_pct:float=0.72,

/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    198         else: self.opt.lr,self.opt.wd = lr,wd
    199         callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
--> 200         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    201 
    202     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    104             if not cb_handler.skip_validate and not learn.data.empty_val:
    105                 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
--> 106                                        cb_handler=cb_handler, pbar=pbar)
    107             else: val_loss=None
    108             if cb_handler.on_epoch_end(val_loss): break

/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
     55         val_losses,nums = [],[]
     56         if cb_handler: cb_handler.set_dl(dl)
---> 57         for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
     58             if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
     59             val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)

/opt/conda/lib/python3.7/site-packages/fastprogress/fastprogress.py in __iter__(self)
     45         except Exception as e:
     46             self.on_interrupt()
---> 47             raise e
     48 
     49     def update(self, val):

/opt/conda/lib/python3.7/site-packages/fastprogress/fastprogress.py in __iter__(self)
     39         if self.total != 0: self.update(0)
     40         try:
---> 41             for i,o in enumerate(self.gen):
     42                 if i >= self.total: break
     43                 yield o

/opt/conda/lib/python3.7/site-packages/fastai/basic_data.py in __iter__(self)
     73     def __iter__(self):
     74         "Process and returns items from `DataLoader`."
---> 75         for b in self.dl: yield self.proc_batch(b)
     76 
     77     @classmethod

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __iter__(self)
    277             return _SingleProcessDataLoaderIter(self)
    278         else:
--> 279             return _MultiProcessingDataLoaderIter(self)
    280 
    281     @property

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
    717             #     before it starts, and __del__ tries to join but will get:
    718             #     AssertionError: can only join a started process.
--> 719             w.start()
    720             self._index_queues.append(index_queue)
    721             self._workers.append(w)

/opt/conda/lib/python3.7/multiprocessing/process.py in start(self)
    110                'daemonic processes are not allowed to have children'
    111         _cleanup()
--> 112         self._popen = self._Popen(self)
    113         self._sentinel = self._popen.sentinel
    114         # Avoid a refcycle if the target function holds an indirect

/opt/conda/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
    221     @staticmethod
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 
    225 class DefaultContext(BaseContext):

/opt/conda/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
    275         def _Popen(process_obj):
    276             from .popen_fork import Popen
--> 277             return Popen(process_obj)
    278 
    279     class SpawnProcess(process.BaseProcess):

/opt/conda/lib/python3.7/multiprocessing/popen_fork.py in __init__(self, process_obj)
     18         self.returncode = None
     19         self.finalizer = None
---> 20         self._launch(process_obj)
     21 
     22     def duplicate_for_child(self, fd):

/opt/conda/lib/python3.7/multiprocessing/popen_fork.py in _launch(self, process_obj)
     68         code = 1
     69         parent_r, child_w = os.pipe()
---> 70         self.pid = os.fork()
     71         if self.pid == 0:
     72             try:

OSError: [Errno 12] Cannot allocate memory
```

Thanks,
Barak

The traceback bottoms out in `os.fork()` while PyTorch is starting its `DataLoader` workers, so your instance is running out of RAM. You might need to reduce the batch size in your script; say, try 32 or 16.

```python
bs = 128  # <- reduce this (try 32 or 16)
data_f = ImageDataBunch.from_name_re(path=path,
                                     pat=r'([^/]+)_\d+\.jpg$',
                                     fnames=img_paths3, test="/content/test/",
                                     bs=bs, size=48)
```
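
Since the fork happens when the `DataLoader` spawns worker processes, cutting the worker count also helps on small instances. A minimal sketch against the fastai v1 lesson-1 setup, assuming `path`, `fnames`, and `pat` are defined as in the notebook (the `bs` and `size` values here are just illustrative):

```python
from fastai.vision import (ImageDataBunch, cnn_learner, models,
                           error_rate, get_transforms)

bs = 16  # smaller batches -> less memory per step
data = ImageDataBunch.from_name_re(
    path, fnames, pat,
    ds_tfms=get_transforms(), size=224,
    bs=bs,
    num_workers=0,  # load batches in the main process; no worker forks to fail
)

learn = cnn_learner(data, models.resnet34, metrics=error_rate)
learn.fit_one_cycle(4)
```

With `num_workers=0` the failing fork never happens; epochs get slower, but on a memory-starved instance that trade-off is usually acceptable.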

Alternatively, try using Google Colab. It’s free. I use a batch size of 128 with 48×48 images and there’s still memory left.
