Hey everyone,
I have been running into an error while training on a new databunch with my already trained RetinaNet model. I originally trained the RetinaNet on a dataset with 3 classes and saved the learner with learner.save. Now I want to reuse the weights of that trained model to train on a new dataset. The new dataset has only 2 classes, and the number of training images has dropped from about 3k to roughly 100. Can anyone explain how I can fine-tune the pretrained RetinaNet on this new data?
PFA, the code for the already trained model:
batch_size = 64
do_flip = True
flip_vert = True
max_rotate = 90
max_zoom = 1.1
max_lighting = 0.2
max_warp = 0.2
p_affine = 0.75
p_lighting = 0.75
tfms = get_transforms(do_flip=do_flip,
                      flip_vert=flip_vert,
                      max_rotate=max_rotate,
                      max_zoom=max_zoom,
                      max_lighting=max_lighting,
                      max_warp=max_warp,
                      p_affine=p_affine,
                      p_lighting=p_lighting)
train, valid = ObjectItemListSlide(train_images), ObjectItemListSlide(valid_images)
item_list = ItemLists(".", train, valid)
lls = item_list.label_from_func(lambda x: x.y, label_cls=SlideObjectCategoryList)
lls = lls.transform(tfms, tfm_y=True, size=patch_size)
data = lls.databunch(bs=batch_size, collate_fn=bb_pad_collate,num_workers=0).normalize()
backbone = "ResNet34" #["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet150"]
backbone_model = models.resnet18
if backbone == "ResNet34":
backbone_model = models.resnet34
if backbone == "ResNet50":
backbone_model = models.resnet50
if backbone == "ResNet101":
backbone_model = models.resnet101
if backbone == "ResNet150":
backbone_model = models.resnet150
pre_trained_on_imagenet = True
encoder = create_body(models.resnet34, pre_trained_on_imagenet, -2)
loss_function = "FocalLoss"
if loss_function == "FocalLoss":
    crit = RetinaNetFocalLoss(anchors)
channels = 128
final_bias = -4
n_conv = 3
model = RetinaNet(encoder, n_classes=data.train_ds.c,
                  n_anchors=len(scales) * len(ratios),
                  sizes=[size[0] for size in sizes],
                  chs=channels,  # number of channels in the hidden layers of the heads
                  final_bias=final_bias,
                  n_conv=n_conv  # number of hidden conv layers in the heads
                  )
voc = PascalVOCMetric(anchors, patch_size, [str(i) for i in data.train_ds.y.classes[1:]])
voc
learn = Learner(data, model, loss_func=crit,
                callback_fns=[BBMetrics, ShowGraph, CSVLogger, partial(GradientClipping, clip=5.0)],
                metrics=[voc])
learn.split([model.encoder[6], model.c5top5])
learn.freeze_to(-2)
learn.lr_find()
learn.recorder.plot(suggestion=True)
plt.savefig('lr_finder_plot_bs64_400+.png')
from fastai.callbacks import SaveModelCallback
# Train with SaveModelCallback so the weights with the best train_loss are kept
learn.unfreeze()
max_learning_rate = 1e-3
cyc_len = 1000
learn.fit_one_cycle(cyc_len, max_learning_rate,
                    callbacks=[SaveModelCallback(learn, monitor='train_loss', name='best_train_loss')])
learn.save("trained_model_1000", return_path=True,with_opt=True)
And this is the code for loading my new databunch and building the learner:
batch_size = 16
do_flip = True
flip_vert = True
max_rotate = 90
max_zoom = 1.1
max_lighting = 0.2
max_warp = 0.2
p_affine = 0.75
p_lighting = 0.75
tfms = get_transforms(do_flip=do_flip,
                      flip_vert=flip_vert,
                      max_rotate=max_rotate,
                      max_zoom=max_zoom,
                      max_lighting=max_lighting,
                      max_warp=max_warp,
                      p_affine=p_affine,
                      p_lighting=p_lighting)
train, valid = ObjectItemListSlide(train_images), ObjectItemListSlide(valid_images)
item_list = ItemLists(".", train, valid)
lls = item_list.label_from_func(lambda x: x.y, label_cls=SlideObjectCategoryList)
lls = lls.transform(tfms, tfm_y=True, size=patch_size)
data = lls.databunch(bs=batch_size, collate_fn=bb_pad_collate,num_workers=0).normalize()
backbone = "ResNet34" #["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet150"]
backbone_model = models.resnet18
if backbone == "ResNet34":
backbone_model = models.resnet34
if backbone == "ResNet50":
backbone_model = models.resnet50
if backbone == "ResNet101":
backbone_model = models.resnet101
if backbone == "ResNet150":
backbone_model = models.resnet150
pre_trained_on_imagenet = True
encoder = create_body(backbone_model, pre_trained_on_imagenet, -2)
loss_function = "FocalLoss"
if loss_function == "FocalLoss":
    crit = RetinaNetFocalLoss(anchors)
channels = 128
final_bias = -4
n_conv = 3
model = RetinaNet(encoder, n_classes=data.train_ds.c,
                  n_anchors=len(scales) * len(ratios),
                  sizes=[size[0] for size in sizes],
                  chs=channels,  # number of channels in the hidden layers of the heads
                  final_bias=final_bias,
                  n_conv=n_conv  # number of hidden conv layers in the heads
                  )
voc = PascalVOCMetric(anchors, patch_size, [str(i) for i in data.train_ds.y.classes[1:]])
voc
learn = Learner(data, model, loss_func=crit,
                callback_fns=[BBMetrics, ShowGraph, CSVLogger, partial(GradientClipping, clip=5.0)],
                metrics=[voc])
learn.split([model.encoder[6], model.c5top5])
learn.freeze_to(-2)
# Here I load the model that was previously saved with torch.save(learn.model, PATH)
learn.model = torch.load('/content/drive/MyDrive/model.pt', map_location=torch.device('cpu'))
# learn.model.eval()
max_learning_rate = 1e-3
cyc_len = 50
batch_size = 64  # note: only used in the print below; the databunch above was built with bs=16
learn.fit_one_cycle(cyc_len, max_learning_rate,
                    callbacks=[SaveModelCallback(learn, monitor='train_loss', name='best_train_loss')])
I am getting this error when I call learn.fit_one_cycle:
Starting Training with n= 50 epochs with batch_size= 64
epoch train_loss valid_loss pascal_voc_metric BBloss focal_loss AP-Mitosis time
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-27-2cf0edfb1663> in <module>
4 print("\n Starting Training with n=",cyc_len,"epochs with batch_size=",batch_size,"\n")
5 learn.fit_one_cycle(cyc_len, max_learning_rate,callbacks=[SaveModelCallback(learn, monitor='train_loss',
----> 6 name='best_train_loss')])
/usr/local/lib/python3.7/dist-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
21 callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
22 final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 23 learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
24
25 def fit_fc(learn:Learner, tot_epochs:int=1, lr:float=defaults.lr, moms:Tuple[float,float]=(0.95,0.85), start_pct:float=0.72,
/usr/local/lib/python3.7/dist-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
198 else: self.opt.lr,self.opt.wd = lr,wd
199 callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
--> 200 fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
201
202 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
/usr/local/lib/python3.7/dist-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
104 if not cb_handler.skip_validate and not learn.data.empty_val:
105 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
--> 106 cb_handler=cb_handler, pbar=pbar)
107 else: val_loss=None
108 if cb_handler.on_epoch_end(val_loss): break
/usr/local/lib/python3.7/dist-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
61 if not is_listy(yb): yb = [yb]
62 nums.append(first_el(yb).shape[0])
---> 63 if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
64 if n_batch and (len(nums)>=n_batch): break
65 nums = np.array(nums, dtype=np.float32)
/usr/local/lib/python3.7/dist-packages/fastai/callback.py in on_batch_end(self, loss)
306 "Handle end of processing one batch with `loss`."
307 self.state_dict['last_loss'] = loss
--> 308 self('batch_end', call_mets = not self.state_dict['train'])
309 if self.state_dict['train']:
310 self.state_dict['iteration'] += 1
/usr/local/lib/python3.7/dist-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
248 "Call through to all of the `CallbakHandler` functions."
249 if call_mets:
--> 250 for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
251 for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
252
/usr/local/lib/python3.7/dist-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
239 def _call_and_update(self, cb, cb_name, **kwargs)->None:
240 "Call `cb_name` on `cb` and update the inner state."
--> 241 new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
242 for k,v in new.items():
243 if k not in self.state_dict:
/usr/local/lib/python3.7/dist-packages/object_detection_fastai/callbacks/callbacks.py in on_batch_end(self, last_output, last_target, **kwargs)
153 num_boxes = len(bbox_gt) * 3
154 for box, cla, scor in list(zip(bbox_pred, preds, scores))[:num_boxes]:
--> 155 temp = BoundingBox(imageName=str(self.imageCounter), classId=self.metric_names_original[cla], x=box[0], y=box[1],
156 w=box[2], h=box[3], typeCoordinates=CoordinatesType.Absolute, classConfidence=scor,
157 bbType=BBType.Detected, format=BBFormat.XYWH, imgSize=(self.size, self.size))
IndexError: list index out of range
Can anyone tell me what might be causing this issue, or how to resolve it?
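In case it helps with the diagnosis, this is the kind of quick check I can add right before learn.fit_one_cycle (just a sketch using the names from my code above; the classifier index 3 comes from the state_dict error further down):

# classes of the new 2-class databunch vs. the names the metric was built from
print("databunch classes:", data.train_ds.y.classes)
print("metric class names:", [str(i) for i in data.train_ds.y.classes[1:]])
# the final conv of the classification head determines how many classes the loaded model predicts
print("classification head:", learn.model.classifier[3])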
P.S. I have also tried saving the state_dict with torch.save(learn.model.state_dict(), PATH) and loading it via load_state_dict, but that gives me a classifier weight-size mismatch error (which makes sense, since the two heads have different numbers of classes):
learn.model.load_state_dict(torch.load("/content/drive/MyDrive/myModel.pth",map_location=torch.device('cpu')))
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-56-229e34fc2c49> in <module>
---> 46 learn.model.load_state_dict(torch.load("/content/drive/MyDrive/myModel.pth",map_location=torch.device('cpu')))
47 #learn.model.eval()
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict)
1496 if len(error_msgs) > 0:
1497 raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-> 1498 self.__class__.__name__, "\n\t".join(error_msgs)))
1499 return _IncompatibleKeys(missing_keys, unexpected_keys)
1500
RuntimeError: Error(s) in loading state_dict for RetinaNet:
size mismatch for classifier.3.weight: copying a param with shape torch.Size([3, 128, 3, 3]) from checkpoint, the shape in current model is torch.Size([2, 128, 3, 3]).
size mismatch for classifier.3.bias: copying a param with shape torch.Size([3]) from checkpoint, the shape in current model is torch.Size([2]).
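The only workaround I have come up with so far (a rough, untested sketch; the classifier.3 key names are taken from the error message just above) is to drop the mismatched classifier keys from the 3-class checkpoint and load everything else non-strictly, so that only the new 2-class head stays randomly initialised and gets retrained:

state = torch.load("/content/drive/MyDrive/myModel.pth", map_location=torch.device('cpu'))
# drop the final classification conv, whose shape depends on the number of classes
state = {k: v for k, v in state.items() if not k.startswith("classifier.3")}
# strict=False loads all matching weights and leaves the 2-class head as initialised
missing, unexpected = learn.model.load_state_dict(state, strict=False)
print("missing keys:", missing)
print("unexpected keys:", unexpected)

Would that be the right way to transfer the backbone weights, or is there a cleaner fastai approach?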
Thanks everyone for this wonderful forum.