Fastai v2 chat

Is there any way of preventing train from computing the losses??

This could be useful with models that return losses at training and not at inference.

Not sure what you mean here. At inference (e.g. predict) there will be no 'loss' since there is no target?

Yijin

I have a model that returns the computed losses itself, so I don't want one_batch to execute
self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')

Because the model returns losses in training mode and predictions in validation mode.

I am trying to redefine the learner as follows:

class Mask_RCNN_Learner(Learner):  # Learner whose training loop takes the loss from the dict returned by the model
    def __init__(self, dls, model, loss_func=None, opt_func=Adam, lr=defaults.lr, splitter=trainable_params, cbs=None,
                 metrics=None, path=None, model_dir='models', wd=None, wd_bn_bias=False, train_bn=True,
                 moms=(0.95,0.85,0.95)):
        super().__init__(dls, model, loss_func, opt_func, lr, splitter, cbs,
                 metrics, path, model_dir, wd, wd_bn_bias, train_bn,
                 moms)  # every argument is forwarded unchanged to Learner.__init__

    def _split(self, b):  # split batch `b` into inputs `self.xb` and targets `self.yb`
        i = getattr(self.dls, 'n_inp', 1 if len(b)==1 else len(b)-1)
        self.xb,self.yb = b[:i],b[i:]  # first `i` elements are inputs, the rest are targets

    def _do_epoch_train(self):  # one pass over `self.dls.train`
        try:
            self.dl = self.dls.train;

            # Modification
            self.n_iter = len(self.dl)
            for o in enumerate(self.dl):
                i, b = *o  # NOTE(review): a bare `*o` is not allowed on the right-hand side; `i, b = o` unpacks the pair
                self.iter = i
                try:
                    self._split(b)
                    loss_dict = self.model(*self.xb,*self.yb)  # model is called with inputs and targets together
                    if len(self.yb) == 0: return  # NOTE(review): `return` leaves the whole method, not just this batch
                    self.loss = sum(loss for loss in loss_dict.values())  # total loss = sum of the dict's values
                    if not self.training: return
                    self.loss.backward()
                    self.opt.step()
                    self.opt.zero_grad()
                except CancelBatchException as e:
                    raise e  # re-raised unchanged
        except CancelTrainException as e:
            raise e  # re-raised unchanged

    def _do_epoch_validate(self, ds_idx=1, dl=None):  # one pass over `dl` (default: `self.dls[ds_idx]`) under no_grad
        if dl is None: dl = self.dls[ds_idx]
        try:
            self.dl = dl;
            with torch.no_grad():
                # Modification
                self.n_iter = len(self.dl)
                for o in enumerate(self.dl):
                    i, b = *o  # NOTE(review): same invalid starred expression as in `_do_epoch_train`
                    self.iter = i
                    try:
                        self._split(b)
                        detection = self.model(*self.xb);  # model is called with the inputs only
                        self.loss =  self.loss_func(detection, *self.yb)

                        # COMPUTING METRICS

                        if not self.training: return  # NOTE(review): exits the method after this batch when not training
                    except CancelBatchException as e:
                        raise e  # re-raised unchanged
        except CancelValidException as e:
            raise e  # re-raised unchanged

    @log_args(but='cbs')
    def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):  # train then validate for `n_epoch` epochs
        with self.added_cbs(cbs):
            if reset_opt or not self.opt: self.create_opt()  # build the optimizer on request or when none exists
            if wd is None: wd = self.wd  # fall back to the learner-level weight decay
            if wd is not None: self.opt.set_hypers(wd=wd)
            self.opt.set_hypers(lr=self.lr if lr is None else lr)  # explicit `lr` wins over `self.lr`

            try:
                self._do_begin_fit(n_epoch)
                for epoch in range(n_epoch):
                    try:
                        self.epoch=epoch
                        self._do_epoch_train()
                        self._do_epoch_validate()
                    except CancelEpochException as e:
                        raise e  # re-raised unchanged

            except CancelFitException as e:
                raise e  # re-raised unchanged

However, I am getting:

  File "<ipython-input-39-0a0657f193c1>", line 23
    self._split(b)
              ^
SyntaxError: can't use starred expression here

The error message points to your use of *, in the line i, b = *o. This SO page explains it, I think? You should try changing that line to i, b = o, or delete that line and just change the line above to for i, b in enumerate(self.dl):

Not sure how all these relate to your question about not computing losses – I did not read through your code, and don't know what's happening in it…!

Good luck.

Yijin

I have solved it, although I don't really know how.

class Mask_RCNN_Learner(Learner):
    "`Learner` whose `one_batch` takes the loss from the dict returned by the model instead of calling `loss_func`."
    def __init__(self, dls, model, loss_func=None, opt_func=Adam, lr=defaults.lr, splitter=trainable_params, cbs=None,
                 metrics=None, path=None, model_dir='models', wd=None, wd_bn_bias=False, train_bn=True,
                 moms=(0.95,0.85,0.95)):
        # Every argument is forwarded unchanged to `Learner.__init__`.
        super().__init__(dls, model, loss_func, opt_func, lr, splitter, cbs,
                 metrics, path, model_dir, wd, wd_bn_bias, train_bn,
                 moms)

    def all_batches(self):
        "Call `one_batch` on every `(index, batch)` pair produced by `self.dl`."
        self.n_iter = len(self.dl)
        for o in enumerate(self.dl): self.one_batch(*o)

    def one_batch(self, i, b):
        "Process batch `b` with index `i`: model call, summed loss, then backward and optimizer step when training."
        self.iter = i
        try:
            self._split(b);                                  self('begin_batch')
            # The targets `self.yb` are handed to the model as one extra argument (not unpacked).
            loss_dict = self.model(*self.xb,self.yb);       self('after_pred')
            # No targets: stop here; the `finally` clause still fires 'after_batch'.
            if len(self.yb) == 0: return
            # Total loss is the sum of every value in the returned dict.
            # NOTE(review): `.values()` assumes the model returned a mapping -- TODO confirm this also holds on the validation pass.
            loss = sum(loss for loss in loss_dict.values())
            self.loss = loss;                                self('after_loss')
            # Outside training, skip the backward pass and the optimizer step.
            if not self.training: return
            self.loss.backward();                            self('after_backward')
            self.opt.step();                                 self('after_step')
            self.opt.zero_grad()
        except CancelBatchException:                         self('after_cancel_batch')
        finally:                                             self('after_batch')

    def _do_begin_fit(self, n_epoch):
        "Store `n_epoch`, reset `self.loss` to a zero tensor and fire 'begin_fit'."
        self.n_epoch,self.loss = n_epoch,tensor(0.);         self('begin_fit')

    def _do_epoch_train(self):
        "Run all batches of `self.dls.train` between the 'begin_train' and 'after_train' events."
        try:
            self.dl = self.dls.train;                        self('begin_train')
            self.all_batches()
        except CancelTrainException:                         self('after_cancel_train')
        finally:                                             self('after_train')

    def _do_epoch_validate(self, ds_idx=1, dl=None):
        "Run all batches of `dl` (default: `self.dls[ds_idx]`) under `torch.no_grad()`."
        if dl is None: dl = self.dls[ds_idx]
        try:
            self.dl = dl;                                    self('begin_validate')
            with torch.no_grad(): self.all_batches()
        except CancelValidException:                         self('after_cancel_validate')
        finally:                                             self('after_validate')

    @log_args(but='cbs')
    def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):
        "Train then validate for `n_epoch` epochs, with `cbs` added for the duration of the call."
        with self.added_cbs(cbs):
            # Build the optimizer on request or when none exists yet.
            if reset_opt or not self.opt: self.create_opt()
            # Fall back to the learner-level weight decay; an explicit `lr` wins over `self.lr`.
            if wd is None: wd = self.wd
            if wd is not None: self.opt.set_hypers(wd=wd)
            self.opt.set_hypers(lr=self.lr if lr is None else lr)

            try:
                self._do_begin_fit(n_epoch)
                for epoch in range(n_epoch):
                    try:
                        self.epoch=epoch;          self('begin_epoch')
                        self._do_epoch_train()
                        self._do_epoch_validate()
                    except CancelEpochException:   self('after_cancel_epoch')
                    finally:                       self('after_epoch')

            except CancelFitException:             self('after_cancel_fit')
            finally:                               self('after_fit')

If you look, I am just adjusting these lines of code:

loss_dict = self.model(*self.xb,self.yb);       self('after_pred')
if len(self.yb) == 0: return
loss = sum(loss for loss in loss_dict.values())
self.loss = loss;                                self('after_loss')

This is done because I am working with torchvision.models.detection.maskrcnn_resnet50_fpn. This model expects as input an image and a dict with the target.

The thing is that in evaluation it returns a dict with a mask, boxes and labels.

I would like the accuracy metrics to be calculated just on the mask.

That's why I was asking where to modify the data passed into metrics. The output of this model is not a usual one.

Should an image look visually similar before and after normalization?
I've created dataloaders with no augmentations, so the images that come out of dls.one_batch() have just been transformed to float tensors.

xb,_ = dls.one_batch()  # keep only the first element of the batch
norm = Normalize.from_stats(*imagenet_stats)
xb_n = norm(xb)  # result of applying the Normalize transform to the batch
show_image(xb_n[0])  # display the first item of the normalized batch

Then I applied the Normalize transform and viewed the image, and it looks totally distorted — some portions of the image have been masked, while the visible portion seems to have gone through a major brightness/contrast change. I've also calculated my own statistics for the dataset and tried to normalize using those, but the images look equally distorted.

I have seen many annotations in the code like @patch and @typedispatch.

What do these annotations do?

  • @patch allows you to write extension functions to predefined types. For instance, there's a patched implementation of Principal Component Analysis for torch.Tensor. Thus, you can simply call it as a member function of a tensor. Imagine x is a 2D tensor storing some embeddings: you can call x.pca(2) to get 2 principal components of that Tensor.

  • @typedispatch enables dynamic dispatch of functions. We can overload a function with a behavior specific to a type and it'll act on those types only. A good example of this would be the IntToFloatTensor transform:

class IntToFloatTensor(Transform):
    "Cast images to float, divided by `div`, and masks to long, floor-divided by `div_mask`."
    order = 10  # Has to run after the PIL transforms, on the GPU

    def __init__(self, div=255., div_mask=1):
        store_attr(self, 'div,div_mask')

    def encodes(self, o:TensorImage):
        # `div_` divides the float tensor in place.
        as_float = o.float()
        return as_float.div_(self.div)

    def encodes(self, o:TensorMask):
        as_long = o.long()
        return as_long // self.div_mask

    def decodes(self, o:TensorImage):
        # A falsy `div` means there is nothing to undo.
        if not self.div: return o
        clamped = o.clamp(0., 1.)
        return (clamped * self.div).long()

The encodes method has 2 different implementations for TensorImage and TensorMask, thus IntToFloatTensor will behave differently for TensorImage and TensorMask.

I believe all these annotations are meta-programming aspects of fastai, which are independent of any Deep Learning framework. The library is called fastcore and you can learn more about it here.

2 Likes

Looks interesting.

Could it be used for overriding predefined methods with same parameters?

I guess you're looking for a way to combine both. I'm not sure I understood correctly; could you tell us more about your use-case?

I would like to use @patch for overriding fastai2.learner.Recorder.after_batch method easily.

If I create a new Recorder class I need to modify the line where the learner extracts the Callbacks array. So, subclassing Learner is not possible!

If I create a subclass of fastai2.data.load.DataLoader(GetAttr) to change this line of code and add a collate that doesn't stack items:

# Pick the batching function by indexing with `self.prebatched`: index 0 -> `fa_collate`, index 1 -> `fa_convert`.
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)

Is it possible to link DataBlock.dataloaders to this new subclass? Or do I need to create new DataBlock subclass linked with Dataset subclass and with DataLoaders subclass?

However, I see something strange: DataLoader is defined in both data.core and data.load.

I hope this is the right place for a short question:
I need to display data in a form: x,y = ((tensor, tensor), (tensor)). I generate it as shown in the previous expression, the whole item at a time. I @typedispatch show_batch() and I get the following error: AssertionError: Match length mismatch (full stacktrace at the end of the post).

I declared my own tuple (that does nothing):

# Subclass of `Tuple` that adds no behavior of its own.
class HeatingTuple(Tuple):
    pass

and tried to write the show_batch() (and show_results() for that matter):
def show_results(x, y, samples:HeatingTuple, . . .
def show_results(x:HeatingTuple, y, samples, . . .
to no avail. Calling dls.show_batch() gives me the error. Manually running the show_batch works. (after I remove the annotation)

DataBlock is using an ItemTransform.
DataBlock(item_tfms=HeatingItemiser(future_len))
HeatingItemiser.encodes() returns a tuple and decodes() a HeatingTuple()

This is the error from dls.show_batch():

AssertionError                            Traceback (most recent call last)
<ipython-input-12-785270320dfd> in <module>
     56 seq_dloaders = seq_block.dataloaders(samples, bs=8)
     57 # onebatch = seq_dloaders.one_batch()
---> 58 seq_dloaders.show_batch()
     59 # show_batch(None, None, samples=onebatch)
     60 # print(f"Data shapes: x[0] {onebatch[0][0].shape}, x[1] {onebatch[0][1].shape} y{onebatch[1].shape}")

~/work/installs/fastai2/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
     97         if b is None: b = self.one_batch()
     98         if not show: return self._pre_show_batch(b, max_n=max_n)
---> 99         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
    100         if unique: self.get_idxs = old_get_idxs
    101 

~/work/installs/fastai2/fastai2/data/core.py in _pre_show_batch(self, b, max_n)
     87         b = self.decode(b)
     88         if hasattr(b, 'show'): return b,None,None
---> 89         its = self._decode_batch(b, max_n, full=False)
     90         if not is_listy(b): b,its = [b],L((o,) for o in its)
     91         return detuplify(b[:self.n_inp]),detuplify(b[self.n_inp:]),its

~/work/installs/fastai2/fastai2/data/core.py in _decode_batch(self, b, max_n, full)
     81         f = self.after_item.decode
     82         f = compose(f, partial(getattr(self.dataset,'decode',noop), full = full))
---> 83         return L(batch_to_samples(b, max_n=max_n)).map(f)
     84 
     85     def _pre_show_batch(self, b, max_n=9):

~/work/installs/fastcore/fastcore/foundation.py in map(self, f, *args, **kwargs)
    373              else f.format if isinstance(f,str)
    374              else f.__getitem__)
--> 375         return self._new(map(g, self))
    376 
    377     def filter(self, f, negate=False, **kwargs):

~/work/installs/fastcore/fastcore/foundation.py in _new(self, items, *args, **kwargs)
    324     @property
    325     def _xtra(self): return None
--> 326     def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
    327     def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
    328     def copy(self): return self._new(self.items.copy())

~/work/installs/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     39             return x
     40 
---> 41         res = super().__call__(*((x,) + args), **kwargs)
     42         res._newchk = 0
     43         return res

~/work/installs/fastcore/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
    315         if items is None: items = []
    316         if (use_list is not None) or not _is_array(items):
--> 317             items = list(items) if use_list else _listify(items)
    318         if match is not None:
    319             if is_coll(match): match = len(match)

~/work/installs/fastcore/fastcore/foundation.py in _listify(o)
    251     if isinstance(o, list): return o
    252     if isinstance(o, str) or _is_array(o): return [o]
--> 253     if is_iter(o): return list(o)
    254     return [o]
    255 

~/work/installs/fastcore/fastcore/foundation.py in __call__(self, *args, **kwargs)
    217             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    218         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 219         return self.fn(*fargs, **kwargs)
    220 
    221 # Cell

~/work/installs/fastcore/fastcore/utils.py in _inner(x, *args, **kwargs)
    346     if order is not None: funcs = funcs.sorted(order)
    347     def _inner(x, *args, **kwargs):
--> 348         for f in L(funcs): x = f(x, *args, **kwargs)
    349         return x
    350     return _inner

~/work/installs/fastai2/fastai2/data/core.py in decode(self, o, full)
    294     def __iter__(self): return (self[i] for i in range(len(self)))
    295     def __repr__(self): return coll_repr(self)
--> 296     def decode(self, o, full=True): return tuple(tl.decode(o_, full=full) for o_,tl in zip(o,tuplify(self.tls, match=o)))
    297     def subset(self, i): return type(self)(tls=L(tl.subset(i) for tl in self.tls), n_inp=self.n_inp)
    298     def _new(self, items, *args, **kwargs): return super()._new(items, tfms=self.tfms, do_setup=False, **kwargs)

~/work/installs/fastcore/fastcore/utils.py in tuplify(o, use_list, match)
    132 def tuplify(o, use_list=False, match=None):
    133     "Make `o` a tuple"
--> 134     return tuple(L(o, use_list=use_list, match=match))
    135 
    136 # Cell

~/work/installs/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     39             return x
     40 
---> 41         res = super().__call__(*((x,) + args), **kwargs)
     42         res._newchk = 0
     43         return res

~/work/installs/fastcore/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
    319             if is_coll(match): match = len(match)
    320             if len(items)==1: items = items*match
--> 321             else: assert len(items)==match, 'Match length mismatch'
    322         super().__init__(items)
    323 

AssertionError: Match length mismatch

Thank you!

1 Like

It seems that a fresh install doesn't necessarily install the correct cuda version.

I installed as http://dev.fast.ai/#Installing

but later I saw that torch.cuda.is_available() returned false, even though I have cuda… so I went to the PyTorch page and grabbed their command line

conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/tyoc213/miniconda3/envs/fastai2

  added / updated specs:
    - cudatoolkit=10.1
    - pytorch
    - torchvision


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cudatoolkit-10.1.243       |       h6bb024c_0       347.4 MB
    pytorch-1.5.0              |py3.7_cuda10.1.243_cudnn7.6.3_0       399.5 MB  pytorch
    torchvision-0.6.0          |       py37_cu101        11.8 MB  pytorch
    ------------------------------------------------------------
                                           Total:       758.7 MB

The following packages will be DOWNGRADED:

  cudatoolkit                            10.2.89-hfd86e86_1 --> 10.1.243-h6bb024c_0
  pytorch              1.5.0-py3.7_cuda10.2.89_cudnn7.6.5_0 --> 1.5.0-py3.7_cuda10.1.243_cudnn7.6.3_0
  torchvision                              0.6.0-py37_cu102 --> 0.6.0-py37_cu101


Proceed ([y]/n)? y


Downloading and Extracting Packages
pytorch-1.5.0        | 399.5 MB  | ########################################################### | 100% 
cudatoolkit-10.1.243 | 347.4 MB  | ########################################################### | 100% 
torchvision-0.6.0    | 11.8 MB   | ########################################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

As you can see, it had installed the wrong cuda version: the one I need here is 10.1, not 10.2. Now torch.cuda.is_available() is true.

I don't know if this was just a failure or why it installed those other libs. Or is there a way for it to detect the correct ones on install? If not, people will think some things are "slow" because they are running on the CPU.

Hi everyone,
In some problems, for efficient storing, we can use lmdb to store our dataset with e.g. millions of images in just 1 or 2 files, like in Pytorch LSUN dataset or in most Scene text recognition problems.
I'm facing an issue when using an lmdb dataset with fastai2. Everything works fine except the export of the trained Learner, i.e. learner.export(). The problem is that when we use lmdb datasets, it opens an environment and then returns indexed images, labels, … from that as arrays of bytes (see this example in the Pytorch LSUN dataset). When a Learner uses DataLoaders created from an lmdb dataset, it can't be exported, and gives TypeError: can't pickle Environment objects, which refers to the environment that was opened.
Is there anything I can do to make the .export() work? Thanks.

Hi,

I am continuing to explore the Siamese network in fastai2. I am trying to 'hook' in GradCAM for a Siamese model, at the 'encoder' layer, but I am not sure how to extract the activations output and gradient for both 'passes' of the two images that form the Siamese image pair…?

From the notebook, as per its definition, the Siamese model's forward pass calls the same 'encoder' twice, on the two images, and concatenates their outputs together before calling 'head'.

I have defined my hooks:

class Hook():
    "Context manager that keeps a detached copy of module `m`'s output from its most recent forward call."
    def __init__(self, m):
        # Keep the handle so the hook can be taken off again in `__exit__`.
        self.hook = m.register_forward_hook(self.hook_func)

    def hook_func(self, m, i, o):
        # Overwrites `stored` on every call, so only the latest output is kept.
        detached = o.detach()
        self.stored = detached.clone()

    def __enter__(self, *args):
        return self

    def __exit__(self, *args):
        self.hook.remove()

class HookBwd():
    "Context manager that keeps a detached copy of the first output-gradient seen by module `m`'s backward hook."
    def __init__(self, m):
        # Keep the handle so the hook can be taken off again in `__exit__`.
        self.hook = m.register_backward_hook(self.hook_func)

    def hook_func(self, m, gi, go):
        # Overwrites `stored` on every call, so only the latest gradient is kept.
        first_grad_out = go[0]
        self.stored = first_grad_out.detach().clone()

    def __enter__(self, *args):
        return self

    def __exit__(self, *args):
        self.hook.remove()

And I created a test image-pair, and applied the transforms using the defined dataloaders:

# Load the two images and wrap them as one SiameseImage.
img1 = PILImage.create(Path('/path/to/image1.jpg'))
img2 = PILImage.create(Path('/path/to/image2.jpg'))
siamtest = SiameseImage(img1, img2)

# Build a test DataLoader holding just that item and pull one batch from it.
tdl = learn.dls.test_dl([siamtest])
x = tdl.one_batch()

And then I called the hooks and eval the model:

cls = 1  # column of `output` to backpropagate from (see `output[0,cls]` below)
with HookBwd(learn.model.encoder) as hookg:
    with Hook(learn.model.encoder) as hook:
        output = learn.model.eval()(x[0],x[1])
        # `Hook.stored` is overwritten on every hook call, so this reads the most recent one only.
        act = hook.stored[0].cpu()
    output[0,cls].backward()
    # Likewise, `HookBwd.stored` holds what the latest backward hook call saw.
    grad = hookg.stored[0].cpu()

But this will only get me 1x act for the activations and 1x grad for the gradients, presumably from the second (i.e. final) call of encoder on the second image x[1]. I am not sure how to make it output two different sets of act and grad, for the encoder pass of x[0] and x[1] respectively.

Thoughts and comments welcome. Thank you.

Regards,
Yijin

Is it better to do early stopping on the loss or on the precision metric?

What do you think of next approach:

Compare the valid loss between 2 models at validation and selecting the best hyperparameters based upon this.

With these hyperparameters, train a model that stops early when accuracy starts decreasing.

How to create dataloader for multiple outputs?

# Build the image path string from the global `path` and element 4 of `x`.
@Transform
def get_x(x): return f"{path}\\train_images\\{x[4]}"
@Transform
def get_arrays(x):
    "Run `rle_decode` on each of `x[0]`..`x[3]` with shape `(xs,ys)` and return the four results as a list."
    shape = (xs,ys)
    return [rle_decode(x[i], shape) for i in range(4)]
@Transform
def array_to_mask(x):
    "Turn each of `x[0]`..`x[3]` into a `PILMask` and return them as a 4-tuple."
    return tuple(PILMask.create(x[i]) for i in range(4))
# Batch-level transforms; not used by the lines shown here.
batch_tfms=[*aug_transforms(size=(xs//7,ys//7)), Normalize.from_stats(*imagenet_stats)]

# Two transform pipelines: the first builds the image, the second builds the tuple of four masks.
dsets=Datasets(train_df_t,tfms=[[get_x,PILImage.create],[get_arrays,array_to_mask]],splits=splits)
dsets.valid[0]

gives me

(PILImage mode=RGB size=2100x1400,
 (PILMask mode=I size=2100x1400,
  PILMask mode=I size=2100x1400,
  PILMask mode=I size=2100x1400,
  PILMask mode=I size=2100x1400))

but

dls = dsets.dataloaders(bs=12,after_item=[ToTensor],before_batch=[IntToFloatTensor, Normalize.from_stats(*imagenet_stats)])
b=dls.one_batch()
# Lengths of the batch, of its first element and of its second element.
len(b),len(b[0]),len(b[1])

gives me
(2, 12, 4)

I expect (2,12,12), but y has only one sample instead of a batch of samples.

Are those 4 masks you getting in b[1] of only one sample ?

Considering those are PILMasks, don't you want to stack them up in a single Tensor, e.g. a Tensor of shape (4,2100,1400)?

There's a parameter n_inp for Datasets; have you tried setting that to 1 explicitly?

1 Like

Is setups in a Transform called only once in the case of DataLoaders? Normalize uses setups to calculate mean and std if they're None. Does this mean we only calculate the mean and std of a single batch and use that for the entire dataset?