@sariabod Thanks for your reply. I can post my example code, but to reproduce you'd need a Docker image with fastai built and pushed to ECR, then have this code run through that container in SageMaker. I was hoping it might be an easier issue to track down, but maybe not.
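For context, here is a rough sketch of how a job like this could be launched with the SageMaker Python SDK. The image URI, role ARN, bucket, and instance type are placeholders, not my actual setup (older SDK versions use `image_name`/`train_instance_type` instead of `image_uri`/`instance_type`):

```python
# Hypothetical launch script -- image URI, role ARN, bucket, and instance type are placeholders.
from sagemaker.estimator import Estimator

estimator = Estimator(
    image_uri='123456789012.dkr.ecr.us-east-1.amazonaws.com/fastai-training:latest',
    role='arn:aws:iam::123456789012:role/SageMakerExecutionRole',
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    hyperparameters={'epochs': 1, 'bs': 4, 'lr_finder': 0},  # passed to the script as --epochs etc.
)

# The 'training' channel shows up inside the container as SM_CHANNEL_TRAINING.
estimator.fit({'training': 's3://my-bucket/camvid/'})
```

The full training script is below.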
```python
# SageMaker CamVid U-Net
import ast
import argparse
import logging
import os
from pathlib import *
from fastai import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.utils.mem import *
from fastai.callbacks.mem import *
from fastai.callbacks.hooks import *
from fastai.callbacks.tracker import *
import multiprocessing as mp
# ignore the PIL warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# setup the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# get the host from environment variable
HOSTNAME = os.environ.get('SM_CURRENT_HOST', 'train-host')
def _train(args):
    path = Path(args.data_dir)
    print(path.ls())
    path_lbl = path/'labels'
    path_img = path/'images'
    model_dir = Path(args.model_dir)
    print(f'Model save dir: {model_dir}')
    if args.verify_images == 1:
        print('verifying images...')
        verify_images(path_img, max_size=int(800))
    fnames = get_image_files(path_img)
    lbl_names = get_image_files(path_lbl)
    get_y_fn = lambda x: path_lbl/f'{x.stem}_P{x.suffix}'
    img_f = fnames[0]
    img = open_image(img_f)
    mask = open_mask(get_y_fn(img_f))
    src_size = np.array(mask.shape[1:])
    codes = np.loadtxt(path/'codes.txt', dtype=str)
    size = src_size//2
    # the max batch size depends on the available GPU RAM
    free = gpu_mem_get_free_no_cache()
    print(f'free GPU memory: {free}')
    src = (SegmentationItemList.from_folder(path_img)
           .split_by_fname_file('../valid.txt')
           .label_from_func(get_y_fn, classes=codes))
    data = (src.transform(get_transforms(), size=size, tfm_y=True)
            .databunch(bs=args.bs, num_workers=args.num_workers)
            .normalize(imagenet_stats))
    print(data.classes, data.c, len(data.train_ds), len(data.valid_ds))
    name2id = {v: k for k, v in enumerate(codes)}
    void_code = name2id['Void']
    def acc_camvid(input, target):
        target = target.squeeze(1)
        mask = target != void_code
        return (input.argmax(dim=1)[mask] == target[mask]).float().mean()
    metrics = acc_camvid
    wd = 1e-2
    callback_fns = [PeakMemMetric, partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.01, patience=3)]
    callbacks = [TerminateOnNaNCallback()]
    learn = unet_learner(data, models.resnet34, metrics=metrics, wd=wd,
                         callbacks=callbacks, callback_fns=callback_fns)
    if args.lr_finder == 1:
        print('starting lr finder...')
        lr_find(learn)
        print('GOT HERE: 1')
        fig = learn.recorder.plot(suggestion=True, return_fig=True)
        print('GOT HERE: 2')
        fig.savefig(model_dir/'lr-1')
        print('GOT HERE: 3')
        min_grad_lr = learn.recorder.min_grad_lr
        lr = min_grad_lr
    else:
        lr = 3e-3
    # stage 1: train with the encoder frozen
    print('starting fit 1...')
    learn.fit_one_cycle(args.epochs, slice(lr), pct_start=0.9)
    print('\n\nGOT HERE: 4\n\n')
    learn.save(model_dir/args.save_name)
    learn.load(model_dir/args.save_name)
    if args.fine_tune == 1:
        # stage 2: unfreeze and fine-tune the whole network
        learn.unfreeze()
        if args.lr_finder == 1:
            print('starting lr finder...')
            lr_find(learn)
            fig2 = learn.recorder.plot(suggestion=True, return_fig=True)
            fig2.savefig(model_dir/'lr-2')
            min_grad_lr = learn.recorder.min_grad_lr
            lr = min_grad_lr
            lrs = slice(lr/100, lr)
        else:
            lrs = slice(lr/400, lr/4)
        print('starting fit 2...')
        learn.fit_one_cycle(12, lrs, pct_start=0.8)
        learn.save(model_dir/args.save_name)
    print(f'Writing classes to model dir')
    save_texts(model_dir/'classes.txt', data.classes)
    print(os.listdir("/opt/ml/model/"))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--verify_images', type=int, default=0,
                        help='check each image and delete it if it cannot be read')
    parser.add_argument('--fine_tune', type=int, default=1,
                        help='run the second (fine-tuning) training stage')
    parser.add_argument('--lr_finder', type=int, default=0,
                        help='run the LR finder')
    parser.add_argument('--num_workers', type=int, default=mp.cpu_count(),
                        help='number of CPUs to use')
    parser.add_argument('--epochs', type=int, default=1, metavar='E',
                        help='number of total epochs to run')
    parser.add_argument('--bs', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='will save as <save_name>.pth')
    # The parameters below retrieve their default values from SageMaker environment variables,
    # which are set by the SageMaker containers framework.
    # https://github.com/aws/sagemaker-containers#how-a-script-is-executed-inside-the-container
    parser.add_argument('--hosts', type=str, default=ast.literal_eval(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    _train(parser.parse_args())
```
So if I just place all of the _train code directly inside main it works, but as-is, saving causes that pickle error. Hmm, I wonder if it has something to do with multiprocessing…
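If it is pickling-related, one thing that stands out is that acc_camvid and the get_y_fn lambda are defined inside _train; the stdlib pickle can only serialize functions by their module-level name, so locally defined functions and lambdas fail. A minimal sketch of that general rule (not necessarily the exact failure path inside learn.save or the DataLoader workers):

```python
# General pickling rule only -- not the actual fastai/SageMaker call path.
import pickle

def top_level(x):            # importable by name -> picklable
    return x + 1

def make_local():
    def local_fn(x):         # defined inside another function -> not picklable
        return x + 1
    return local_fn

as_lambda = lambda x: x + 1  # lambdas aren't picklable either

pickle.dumps(top_level)      # fine

for fn in (make_local(), as_lambda):
    try:
        pickle.dumps(fn)
    except (pickle.PicklingError, AttributeError) as err:
        print(f'cannot pickle {fn!r}: {err}')
```

If that's what's happening, moving acc_camvid and get_y_fn to module level (or temporarily setting num_workers=0 to rule out the DataLoader workers) might be a quick way to confirm.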