VGG16 fine tuning: low accuracy

Hi,

I have a code posted in this SO question which tries to apply fine-tuning to cats/dogs dataset. But somehow, it doesn’t work as needed. Fine-tuned model has a very low accuracy (around 50%). Therefore, probably I’ve done something wrong.

Therefore, my question is, could someone help me to figure out what’s going wrong with the code? I know there are a lot of examples in blogs/forums and this course’s materials on how to fine tune models. But I want to understand why my code (which looks like a quite straightforward sequence of actions to me) doesn’t work and figure out how to do fine-tuning in a right way.

Thank you.

1 Like

Hi @devforfu, How were you able to load VGG pre-trained weights if the image is resized to (244,244).The original VGG16 expects 224x224 right ?

Well, that’s awkward. Let me try to fix this. I’ll update my code and return back with results.

Ok, here is an updated source code, in a single chunk:

import warnings
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', DeprecationWarning)

from __future__ import print_function
from itertools import izip_longest as zip_longest
from pprint import pformat as pf
from pprint import pprint as pp
import os

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.layers import Conv2D, MaxPooling2D, MaxPooling2D, ZeroPadding2D
from keras.layers import Dropout, Flatten, Dense, InputLayer, Lambda
from keras.models import Sequential, Model, load_model
from keras.utils.data_utils import get_file
from keras.optimizers import SGD
import keras.backend as K

import numpy as np


RANDOM_STATE = 1
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224
BATCH_SIZE = 4
VGG_MEAN = np.array([123.68, 116.779, 103.939]).reshape((3, 1, 1))
VGG16_WEIGHTS_PATH = 'http://www.platform.ai/models/vgg16.h5'
DATA_ROOT = os.path.join(os.path.expanduser('~'), 'data', 'dogscats')
TRAIN_DIR = os.path.join(DATA_ROOT, 'train')
VALID_DIR = os.path.join(DATA_ROOT, 'valid')
SAMPLES_DIR = os.path.expanduser('~/dogscats_samples')


np.random.seed(RANDOM_STATE)
K.set_image_dim_ordering('th')


def get_batches(dirname, gen=ImageDataGenerator(), shuffle=True, 
                batch_size=BATCH_SIZE, class_mode='categorical'):
    return gen.flow_from_directory(
        os.path.join(SAMPLES_DIR, dirname),
        target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
        class_mode=class_mode,
        shuffle=shuffle,
        batch_size=batch_size)

def vgg_preprocess(x):
    x = x - VGG_MEAN
    return x[:, ::-1]

def conv_block(model, n_layers, n_filters, name='block'):
    for i in range(n_layers):
        model.add(ZeroPadding2D((1, 1), name='%s_padding_%s' % (name, i)))
        model.add(Conv2D(n_filters, (3, 3), activation='relu', name='%s_conv2d_%s' % (name, i)))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='%s_maxpool' % name))
    
def fc_block(model, name='block'):
    model.add(Dense(4096, activation='relu', name=name + '_dense'))
    model.add(Dropout(0.5))
    
def build_vgg_16():
    model = Sequential()
    input_shape = (3, IMAGE_WIDTH, IMAGE_HEIGHT) 
    model.add(InputLayer(input_shape=input_shape))
    model.add(Lambda(vgg_preprocess))
    conv_block(model, n_layers=2, n_filters=64, name='block1')
    conv_block(model, n_layers=2, n_filters=128, name='block2')
    conv_block(model, n_layers=3, n_filters=256, name='block3')
    conv_block(model, n_layers=3, n_filters=512, name='block4')
    conv_block(model, n_layers=3, n_filters=512, name='block5')
    model.add(Flatten())
    fc_block(model)
    fc_block(model)
    model.add(Dense(1000, activation='softmax'))
    return model

def train_finetuned_model():
    file_path = get_file('vgg16.h5', VGG16_WEIGHTS_PATH, cache_subdir='models')
    print('Building VGG16 (no-top) model to generate bottleneck features')
    vgg16_notop = build_vgg_16()
    vgg16_notop.load_weights(file_path)
    for _ in range(6):
        vgg16_notop.pop()
    vgg16_notop.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    train_batches = get_batches('train', shuffle=False, class_mode=None)
    train_labels = np.array([0]*1000 + [1]*1000)
    bottleneck_train = vgg16_notop.predict_generator(train_batches, steps=2000 // BATCH_SIZE)
    
    valid_batches = get_batches('valid', shuffle=False, class_mode=None)
    valid_labels = np.array([0]*400 + [1]*400)
    bottleneck_valid = vgg16_notop.predict_generator(valid_batches, steps=800 // BATCH_SIZE)
    
    print('Training top model on bottleneck features')
    top_model = Sequential()
    top_model.add(Flatten(input_shape=bottleneck_train.shape[1:]))
    top_model.add(Dense(256, activation='relu'))
    top_model.add(Dropout(0.5))
    top_model.add(Dense(1, activation='sigmoid'))
    top_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    top_model.fit(bottleneck_train, train_labels, 
                  batch_size=32, epochs=50, 
                  validation_data=(bottleneck_valid, valid_labels))

    print('Concatenate new VGG16 (without top layer) with pretrained top model')
    vgg16_fine = build_vgg_16()    
    vgg16_fine.load_weights(file_path)
    for _ in range(6):
        vgg16_fine.pop()
    vgg16_fine.add(Flatten(name='top_flatten'))    
    vgg16_fine.add(Dense(256, activation='relu', name='top_dense'))
    vgg16_fine.add(Dropout(0.5, name='top_dropout'))
    vgg16_fine.add(Dense(1, activation='sigmoid', name='top_sigmoid'))
    for i, layer in enumerate(reversed(top_model.layers), 1):
        pretrained_weights = layer.get_weights()
        vgg16_fine.layers[-i].set_weights(pretrained_weights)
    for layer in vgg16_fine.layers[:26]:
        layer.trainable = False
    vgg16_fine.compile(optimizer=SGD(lr=1e-4, momentum=0.9),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    
    print('Train concatenated model on dogs/cats dataset sample')
    train_datagen = ImageDataGenerator(rescale=1./255,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True)
    test_datagen = ImageDataGenerator(rescale=1./255)
    train_batches = get_batches('train', gen=train_datagen, class_mode='binary')
    valid_batches = get_batches('valid', gen=test_datagen, class_mode='binary')
    vgg16_fine.fit_generator(train_batches,
                             steps_per_epoch=2000 // BATCH_SIZE,
                             epochs=50,
                             validation_data=valid_batches,
                             validation_steps=800 // BATCH_SIZE)
    return vgg16_fine


final_model = train_finetuned_model()

No any other steps applied. And this exact code has a very small accuracy, i.e. VGG16 without top layers connected with FC model pre-trained on bottleneck features.

Your fully connected layers look totally different from the original VGG architecture.

# yours
Flatten()
Dense(256, activation='relu')
Dense(1, activation='sigmoid')

# original
Flatten()
Dense(4096, activation='relu')
Dense(4096, activation='relu')
Dense(2, activation='softmax')

Two points.

  1. The last layer should be 2-class-softmax instead of sigmoid. The accuracy is not computed as you expect if you use sigmoid, I guess.
  2. Complexity (number of neurons and layers) seems to be too low.

I also posted the same answer in StackOverflow.

Ok, I see. It seems that fine-tuning is not as straightforward as I’ve thought at the beginning. I’ll try your advise and return back with results.

Unfortunately, now I have another problem - now the fully-connected model trained on bottleneck features has low accuracy, i.e. with previous model this custom top network had ~98% accuracy, but when I’ve changes number of neurons and switched to softmax, the accuracy dropped to the same 50%.

Here is a fragment that I’ve changed to follow your advice, i.e. new network architecture and softmax instaed of sigmoid:

train_labels = to_categorical(train_labels)
valid_labels = to_categorical(valid_labels)

print('Training top model on bottleneck features')
top_model = Sequential()
top_model.add(Flatten(input_shape=bottleneck_train.shape[1:]))
top_model.add(Dense(4096, activation='relu'))
top_model.add(Dense(4096, activation='relu'))
top_model.add(Dense(2, activation='softmax'))
top_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
top_model.fit(bottleneck_train, train_labels, 
              batch_size=32, epochs=10, 
              validation_data=(bottleneck_valid, valid_labels))

And here is a result of training:

Training top model on bottleneck features
Train on 2000 samples, validate on 800 samples
Epoch 1/10
2000/2000 [==============================] - 4s - loss: 8.0072 - acc: 0.4955 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 2/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 3/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 4/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 5/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 6/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 7/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 8/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 9/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000
Epoch 10/10
2000/2000 [==============================] - 4s - loss: 8.0151 - acc: 0.5000 - val_loss: 8.0151 - val_acc: 0.5000

Not sure what’s going wrong. Probably something with training examples or labels.

Hi @devforfu,
I think it’s a mismatch with train data and labels

When you do
bottleneck_train = vgg16_notop.predict_generator(train_batches, steps=2000 // BATCH_SIZE)

it doesn’t give 1000 cats and 1000 dogs.It gives you 2000 cats

That’s why you are getting 50% accuracy.It is able to predict first half of them accurately but failing for the later half because of wrong labels

@rteja1113 Yes, that is what I am thinking about. I’ve decided to verify data I have and try again.

Will return back with new results.

I was getting very similar results on State Farm, with my predictions stuck at 10% accuracy with the 10 classes. (Equivalent to your 50% at dogscats with 2 classes.)

My model just would not train. Loss rate didn’t improve, and accuracy was stuck at the minimum.
I finally made progress when I reduced the learning rate…
Dividing it by 10 helped. Dividing it by 100 helped tremendously. I went from 10% accuracy to near 80%.

With Keras RMSprop, the default lr is 0.001. I found best results around lr=0.00001.

I believe what is happening here is that the search surface is very complex. Higher learning rates jump the weights too far, across multiple peaks in the search area. This means that there is no effective loss formula relationship between each training iteration, preventing the optimizer from ever heading toward a local optima.

I would love to hear from experienced ML coders to see if my explanation is on the right track.

I am experiencing a similar problem where my loss stays constant and my binary (0,1) prediction accuracy is always 50%. My model seems to just always predict 0, so half are predicted correctly and half incorrectly. My input data is always fed into the network in even sample numbers so ever 128 batch has 64 true and 64 false images.

Any ideas on what the issue might be? My model can be seen at https://gist.github.com/system123/905e1dcdcb201ac6cb08d6b303364478

Hi @System123, the model is good, can you show how you created the training set and validation set

Hi,

The dataset is created by generating triplets of (image1, image2, y) where img1 is selected first and then img2 is selected to ensure it corresponds to img1 and the label is set to 1. Then a second sample is added to the dataset (img1, img3, y) where img3 is selected to ensure it isn’t similar to img1 or img2, and the label is set to 0.

So the target vector is interleaved [1,0,1,0,1…] so their are equal positive and negative results. This dataset is then split into training, validation and testing datasets. All contain equal positive and negative results.

The output from training is here: https://gist.github.com/system123/7ac32dce097c2148ab5dde451a528dea

You will see that Keras says it gets 98% accuracy when testing using the testing dataset, but using sklearn metrics and you can see the network only ever predicts zero even when I run the training data through the model using predict_generator.

I have confirmed that the dataset is correctly being generated in batches by displaying the images side by side. I also normalise all the images to values between [0 and 1] and then subtract the mean.

I think there is an issue with the Keras accuracy metric, as there is no way that it can get such high accuracy, and even when I check the trained network it ends up only 50% accurate, while Keras says it is over 98% accurate.

Hi @System123, what does model.predict() return ?

It returns an array of predicted values, all of which are 0. Hence it only predicts everything as a negative class.

Ok, I’ve tried to change learning rate and to verify data, but it seems that the network still have a pathetic accuracy. Don’t know what is going wrong.

Probably there is a someone who would like to get an access to my host with deployed notebook to help me with this problem? Because it seems that a time to give up comes :slight_smile:

It seems that probably you’re right about learning rate - I reduced it down to 1e-6 (also, switched to the RMSprop optimizer) and now the model has approximately ~70% accuracy after ~100 epochs. But I am not sure if this is the only reason, because I also re-created my data layout and rewritten again some fragments of the code.

Though still, even 70% accuracy is much lower then one provided in keras post about cats/dogs training, which was around 95-98%.

Interesting enough, that even using VGG16 code from lecture notebooks, I am still having very low accuracy. Though the tutor’s notebook have something like ~90% after single epoch. The only difference is a data layout - I am using my own scripts to place files into correct folders. And I’ve verified that they contain correct images.

Could anybody share a script/notebook which contains the full dogs/cats fine-tuning code, which is compatible with Python 3 and keras=1.2.0?

Hi,
I also had this issue and was able to fix it, but do not really understand how I fixed it. I tried the suggestions in this post, but they did not seem to help. To check that it was just a problem with the finetuned model I reran the linear model from the beginning of JH’s notebook:

lm = Sequential([ Dense(2, activation='softmax', input_shape=(1000,)) ])
lm.compile(optimizer=RMSprop(lr=0.1), loss='categorical_crossentropy', metrics=['accuracy'])
lm.fit(trn_features, trn_labels, nb_epoch=3, batch_size=batch_size, 
       validation_data=(val_features, val_labels))

With results:

Train on 23000 samples, validate on 2000 samples
Epoch 1/3
23000/23000 [==============================] - 0s - loss: 0.0794 - acc: 0.9784 - val_loss: 0.1147 - val_acc: 0.9745
Epoch 2/3
23000/23000 [==============================] - 0s - loss: 0.0806 - acc: 0.9781 - val_loss: 0.1190 - val_acc: 0.9725
Epoch 3/3
23000/23000 [==============================] - 0s - loss: 0.0809 - acc: 0.9793 - val_loss: 0.1224 - val_acc: 0.9740

Then I recreated the finetuned model and compiled, decreasing the learning rate from 0.1 to .001:

model.pop()
for layer in model.layers: layer.trainable=False

model.add(Dense(2, activation='softmax'))

batch_size = 64 

gen=image.ImageDataGenerator()
batches = gen.flow(trn_data, trn_labels, batch_size=batch_size, shuffle=True)
val_batches = gen.flow(val_data, val_labels, batch_size=batch_size, shuffle=False)

def fit_model(model, batches, val_batches, nb_epoch=1):
   model.fit_generator(batches, samples_per_epoch=batches.N, nb_epoch=nb_epoch, 
                    validation_data=val_batches, nb_val_samples=val_batches.N)

opt = RMSprop(lr=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

fit_model(model, batches, val_batches, nb_epoch=1)

This returns results with accuracy close to JH’s result:

Epoch 1/1
23000/23000 [==============================] - 625s - loss: 0.1207 - acc: 0.9683 - val_loss: 0.0576 - 
val_acc: 0.9855

I’m not entirely sure how this fixed the problem, but maybe it will be helpful for you.

Hi
Facing same issue withon python 3 and keras 1.2.2 would it be a config issue?