The LR finder is only a rough guide in any situation @devforfu - generally you'll want to try LRs around 4x lower and 4x higher too.
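In code, trying a spread around the suggested value looks something like this (the 1e-2 and the fit helper are placeholders, not values from the lesson):

suggested_lr = 1e-2  # hypothetical output of the LR finder
for lr in (suggested_lr/4, suggested_lr, suggested_lr*4):
    print(f'trying lr={lr:.2e}')
    # learn = fit(model, lr=lr)  # fit() stands for whatever training helper you use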
I think ProgressCB is fantastic, but I thought it would be better to display how long it took to finish the epoch, like fastai does. So I looked into some fastprogress source code and came up with this:
from time import time
from copy import copy
from torcheval.metrics import Mean
from .datasets import *
from .learner import *

# Code from fastprogress: https://github.com/fastai/fastprogress/blob/master/fastprogress/core.py#L7
def format_time(t):
    "Format `t` (in seconds) to (h):mm:ss"
    t = int(t)
    h,m,s = t//3600, (t//60)%60, t%60
    if h != 0: return f'{h}:{m:02d}:{s:02d}'
    else:      return f'{m:02d}:{s:02d}'
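A quick sanity check of what format_time produces (hours only show up when non-zero):

print(format_time(75))    # 01:15
print(format_time(3725))  # 1:02:05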
class MetricsCB(Callback):
    def __init__(self, *ms, **metrics):
        for o in ms: metrics[type(o).__name__] = o
        self.metrics = metrics
        self.all_metrics = copy(metrics)
        self.all_metrics['loss'] = self.loss = Mean()

    def _log(self, x): print(x)
    def before_fit(self, learn): learn.metrics = self

    def before_epoch(self, learn):
        for m in self.all_metrics.values(): m.reset()
        self.start_time = time()  # added: record when the epoch starts

    def after_epoch(self, learn):
        log = {k: f'{v.compute():.3f}' for k, v in self.all_metrics.items()}
        log['epoch'] = learn.epoch
        log['train'] = learn.model.training
        log['time'] = format_time(time() - self.start_time)  # added: elapsed epoch time
        self._log(log)

    def after_batch(self, learn):
        x, y = learn.batch
        for m in self.metrics.values():
            m.update(to_cpu(learn.preds), to_cpu(y))
        self.loss.update(to_cpu(learn.loss), weight=len(x))
I only modified MetricsCB by adding one line to before_epoch and another to after_epoch to save the elapsed time into the log. Hopefully this helps.
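For reference, wiring it up looks roughly like this; the accuracy metric and the logged numbers are just illustrative, not output from a real run:

from torcheval.metrics import MulticlassAccuracy

metrics = MetricsCB(accuracy=MulticlassAccuracy())
# learn = Learner(model, dls, loss_func=F.cross_entropy, lr=0.2, cbs=[metrics, ProgressCB()])
# learn.fit(1)
# prints something like:
# {'accuracy': '0.812', 'loss': '0.534', 'epoch': 0, 'train': True, 'time': '00:23'}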
I am getting the following error when executing the code snippet in notebook 10. I have not made any modifications to the code or the miniai library. I am setting up my local Conda environment on my M1 Mac, so I'm not sure if it's because of that.
Any inputs would be appreciated.
set_seed(1)
learn = fit(nn.Sequential(*cnn_layers()))
[W ParallelNative.cpp:230] Warning: Cannot set number of intraop threads after parallel work has started or after set_num_threads call when using native parallel backend (function set_num_threads)
(the warning above is printed four times)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[22], line 2
1 set_seed(1)
----> 2 learn = fit(nn.Sequential(*cnn_layers()))
Cell In[21], line 4, in fit(model, epochs, xtra_cbs)
1 def fit(model, epochs=1, xtra_cbs=None):
2 # setting up a high learning rate of 0.6 as an experiment
3 learn = Learner(model, dls, loss_func=F.cross_entropy, lr=0.6, cbs=cbs+fc.L(xtra_cbs))
----> 4 learn.fit(epochs)
5 return learn
File ~/fastai-2023-part2/nbs/miniai/learner.py:178, in Learner.fit(self, n_epochs, train, valid, cbs, lr)
176 if lr is None: lr = self.lr
177 if self.opt_func: self.opt = self.opt_func(self.model.parameters(), lr)
--> 178 self._fit(train, valid)
179 finally:
180 for cb in cbs: self.cbs.remove(cb)
File ~/fastai-2023-part2/nbs/miniai/learner.py:129, in with_cbs.__call__.<locals>._f(o, *args, **kwargs)
127 try:
128 o.callback(f'before_{self.nm}')
--> 129 f(o, *args, **kwargs)
130 o.callback(f'after_{self.nm}')
131 except globals()[f'Cancel{self.nm.title()}Exception']: pass
File ~/fastai-2023-part2/nbs/miniai/learner.py:166, in Learner._fit(self, train, valid)
163 @with_cbs('fit')
164 def _fit(self, train, valid):
165 for self.epoch in self.epochs:
--> 166 if train: self.one_epoch(True)
167 if valid: torch.no_grad()(self.one_epoch)(False)
File ~/fastai-2023-part2/nbs/miniai/learner.py:161, in Learner.one_epoch(self, training)
159 self.model.train(training)
160 self.dl = self.dls.train if training else self.dls.valid
--> 161 self._one_epoch()
File ~/fastai-2023-part2/nbs/miniai/learner.py:129, in with_cbs.__call__.<locals>._f(o, *args, **kwargs)
127 try:
128 o.callback(f'before_{self.nm}')
--> 129 f(o, *args, **kwargs)
130 o.callback(f'after_{self.nm}')
131 except globals()[f'Cancel{self.nm.title()}Exception']: pass
File ~/fastai-2023-part2/nbs/miniai/learner.py:156, in Learner._one_epoch(self)
154 @with_cbs('epoch')
155 def _one_epoch(self):
--> 156 for self.iter,self.batch in enumerate(self.dl): self._one_batch()
File ~/anaconda3/lib/python3.11/site-packages/fastprogress/fastprogress.py:50, in ProgressBar.__iter__(self)
48 except Exception as e:
49 self.on_interrupt()
---> 50 raise e
File ~/anaconda3/lib/python3.11/site-packages/fastprogress/fastprogress.py:41, in ProgressBar.__iter__(self)
39 if self.total != 0: self.update(0)
40 try:
---> 41 for i,o in enumerate(self.gen):
42 if self.total and i >= self.total: break
43 yield o
File ~/anaconda3/lib/python3.11/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File ~/anaconda3/lib/python3.11/site-packages/torch/utils/data/dataloader.py:1345, in _MultiProcessingDataLoaderIter._next_data(self)
1343 else:
1344 del self._task_info[idx]
-> 1345 return self._process_data(data)
File ~/anaconda3/lib/python3.11/site-packages/torch/utils/data/dataloader.py:1371, in _MultiProcessingDataLoaderIter._process_data(self, data)
1369 self._try_put_index()
1370 if isinstance(data, ExceptionWrapper):
-> 1371 data.reraise()
1372 return data
File ~/anaconda3/lib/python3.11/site-packages/torch/_utils.py:694, in ExceptionWrapper.reraise(self)
690 except TypeError:
691 # If the exception takes multiple arguments, don't try to
692 # instantiate since we don't know how to
693 raise RuntimeError(msg) from None
--> 694 raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
data = fetcher.fetch(index)
^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
return self.collate_fn(data)
^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/fastai-2023-part2/nbs/miniai/datasets.py", line 27, in _f
def _f(b): return get(default_collate(b))
^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 127, in collate
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 127, in <dictcomp>
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/utils/data/_utils/collate.py", line 160, in collate_tensor_fn
storage = elem._typed_storage()._new_shared(numel, device=elem.device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/storage.py", line 866, in _new_shared
untyped_storage = torch.UntypedStorage._new_shared(size * self._element_size(), device=device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/storage.py", line 260, in _new_shared
return cls._new_using_filename_cpu(size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: torch_shm_manager at "/Users/anirudhg/anaconda3/lib/python3.11/site-packages/torch/bin/torch_shm_manager": could not generate a random directory for manager socket
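In case it helps anyone hitting the same thing: this failure happens inside the DataLoader worker processes when they try to allocate shared memory, which can be flaky on macOS. Two hedged guesses worth trying (neither is verified on this exact setup):

import torch

# Guess 1: switch PyTorch to file-system based sharing instead of the shm manager
torch.multiprocessing.set_sharing_strategy('file_system')

# Guess 2: avoid worker processes entirely (slower, but no shared memory involved),
# e.g. by passing num_workers=0 wherever the DataLoaders are built in the notebook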
Hi Jeremy
I am facing an error, OSError: [WinError 127] The specified procedure could not be found, while doing “from miniai.learner import *”.
Any idea how to fix this?
I found the solution. In Colab you have to run only:
!pip install -Uqq git+https://github.com/fastai/course22p2 --no-cache-dir
Hi,
I was generating a miniai project based on your notebooks and using nbdev. Weirdly, the 09 learner notebook executes perfectly fine in JupyterLab, but once I run nbdev prepare it raises an error. With lots of trial and error I already figured out what nbdev has a problem with: it is the marked line in the ProgressCB callback below.
class ProgressCB(Callback):
    order = MetricsCB.order+1
    def __init__(self, plot=False): self.plot = plot
    def before_fit(self, learn):
        learn.epochs = self.mbar = master_bar(learn.epochs)  # <- the line nbdev chokes on
Obviously, master_bar overwrites learn.epochs, and now nbdev says that the object has no len() - I guess nbdev now somehow uses it in the learning loop and can't iterate because it is no longer a range? I find it weird that the code executes perfectly fine in JupyterLab and iterates through the learning loop, but nbdev has a problem.
Has anybody else experienced this?
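One workaround sketch (untested with nbdev, so just a guess): stash the original epochs range in before_fit and restore it in after_fit, so anything that later asks for len(learn.epochs) gets the plain range back:

class ProgressCB(Callback):
    order = MetricsCB.order+1
    def __init__(self, plot=False): self.plot = plot
    def before_fit(self, learn):
        self._orig_epochs = learn.epochs                     # keep the plain range around
        learn.epochs = self.mbar = master_bar(learn.epochs)  # wrap it for the progress bar
    def after_fit(self, learn):
        learn.epochs = self._orig_epochs                     # restore it so len() works again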