How can I load a pretrained model on Kaggle using fastai?

When I try to run this code:

learn = ConvLearner.pretrained(f_model, md, metrics=[accuracy])

I get an error because the kernel on Kaggle cannot download the weights from the internet

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /tmp/.torch/models/resnet34-333f7ec4.pth
---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
/opt/conda/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1317                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error

/opt/conda/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/opt/conda/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/opt/conda/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/opt/conda/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/opt/conda/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

/opt/conda/lib/python3.6/http/client.py in connect(self)
   1391 
-> 1392             super().connect()
   1393 

/opt/conda/lib/python3.6/http/client.py in connect(self)
    935         self.sock = self._create_connection(
--> 936             (self.host,self.port), self.timeout, self.source_address)
    937         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/opt/conda/lib/python3.6/socket.py in create_connection(address, timeout, source_address)
    703     err = None
--> 704     for res in getaddrinfo(host, port, 0, SOCK_STREAM):
    705         af, socktype, proto, canonname, sa = res

/opt/conda/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    744     addrlist = []
--> 745     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    746         af, socktype, proto, canonname, sa = res

gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-68-b0bc3f5d394a> in <module>()
----> 1 learn = ConvLearner.pretrained(f_model, md, metrics=[accuracy])
      2 learn.opt_fn = optim.Adam

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/conv_learner.py in pretrained(cls, f, data, ps, xtra_fc, xtra_cut, custom_head, precompute, **kwargs)
     98     def pretrained(cls, f, data, ps=None, xtra_fc=None, xtra_cut=0, custom_head=None, precompute=False, **kwargs):
     99         models = ConvnetBuilder(f, data.c, data.is_multi, data.is_reg,
--> 100             ps=ps, xtra_fc=xtra_fc, xtra_cut=xtra_cut, custom_head=custom_head)
    101         return cls(data, models, precompute, **kwargs)
    102 

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/conv_learner.py in __init__(self, f, c, is_multi, is_reg, ps, xtra_fc, xtra_cut, custom_head)
     36         else: cut,self.lr_cut = 0,0
     37         cut-=xtra_cut
---> 38         layers = cut_model(f(True), cut)
     39         self.nf = model_features[f] if f in model_features else (num_features(layers)*2)
     40         if not custom_head: layers += [AdaptiveConcatPool2d(), Flatten()]

/opt/conda/lib/python3.6/site-packages/torchvision-0.2.0-py3.6.egg/torchvision/models/resnet.py in resnet34(pretrained, **kwargs)
    174     model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    175     if pretrained:
--> 176         model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    177     return model
    178 

/opt/conda/lib/python3.6/site-packages/torch/utils/model_zoo.py in load_url(url, model_dir, map_location, progress)
     63         sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
     64         hash_prefix = HASH_REGEX.search(filename).group(1)
---> 65         _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
     66     return torch.load(cached_file, map_location=map_location)
     67 

/opt/conda/lib/python3.6/site-packages/torch/utils/model_zoo.py in _download_url_to_file(url, dst, hash_prefix, progress)
     68 
     69 def _download_url_to_file(url, dst, hash_prefix, progress):
---> 70     u = urlopen(url)
     71     if requests_available:
     72         file_size = int(u.headers["Content-Length"])

/opt/conda/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/opt/conda/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response

/opt/conda/lib/python3.6/urllib/request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result

/opt/conda/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/opt/conda/lib/python3.6/urllib/request.py in https_open(self, req)
   1359         def https_open(self, req):
   1360             return self.do_open(http.client.HTTPSConnection, req,
-> 1361                 context=self._context, check_hostname=self._check_hostname)
   1362 
   1363         https_request = AbstractHTTPHandler.do_request_

/opt/conda/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error
-> 1320                 raise URLError(err)
   1321             r = h.getresponse()
   1322         except:

URLError: <urlopen error [Errno -2] Name or service not known>

The weights are available here, but I’m not sure how to load them without having fastai try to download the weights from the internet.

Any ideas?

3 Likes

In order to access the Pytorch weights in Kaggle kernels you would need to access them first from a Kaggle dataset (like the one you linked to) but then I believe you would first need to copy the weight files to the location where Pytorch is looking for them. If Pytorch sees the weights are already there then it won’t try to download them. You can do all of this in the Kaggle kernel.

I found someone did something similar here in a Kaggle kernel for Keras weights so maybe it would be helpful as a reference.
https://www.kaggle.com/gaborfodor/dog-breed-pretrained-keras-models-lb-0-3/notebook

4 Likes

Hi James,

Thanks for the help!

Here’s what I did:

Create the models directory:

cache_dir = expanduser(join('~', '.torch'))
if not exists(cache_dir):
    makedirs(cache_dir)
models_dir = join(cache_dir, 'models')
if not exists(models_dir):
    makedirs(models_dir)

Copy the weights over:

!cp ../input/resnet34/resnet34.pth /tmp/.torch/models/resnet34-333f7ec4.pth

Now I get this error

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-73-b0bc3f5d394a> in <module>()
----> 1 learn = ConvLearner.pretrained(f_model, md, metrics=[accuracy])
      2 learn.opt_fn = optim.Adam

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/conv_learner.py in pretrained(cls, f, data, ps, xtra_fc, xtra_cut, custom_head, precompute, **kwargs)
     99         models = ConvnetBuilder(f, data.c, data.is_multi, data.is_reg,
    100             ps=ps, xtra_fc=xtra_fc, xtra_cut=xtra_cut, custom_head=custom_head)
--> 101         return cls(data, models, precompute, **kwargs)
    102 
    103     @property

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/conv_learner.py in __init__(self, data, models, precompute, **kwargs)
     85     def __init__(self, data, models, precompute=False, **kwargs):
     86         self.precompute = False
---> 87         super().__init__(data, models, **kwargs)
     88         if hasattr(data, 'is_multi'):
     89             self.crit = F.binary_cross_entropy if data.is_multi else F.nll_loss

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/learner.py in __init__(self, data, models, opt_fn, tmp_name, models_name, metrics, clip)
     22         self.tmp_path = os.path.join(self.data.path, tmp_name)
     23         self.models_path = os.path.join(self.data.path, models_name)
---> 24         os.makedirs(self.tmp_path, exist_ok=True)
     25         os.makedirs(self.models_path, exist_ok=True)
     26         self.crit,self.reg_fn = None,None

/opt/conda/lib/python3.6/os.py in makedirs(name, mode, exist_ok)
    218             return
    219     try:
--> 220         mkdir(name, mode)
    221     except OSError:
    222         # Cannot rely on checking for EEXIST, since the operating system

OSError: [Errno 30] Read-only file system: '../input/tmp'

So fastai tries to write to the data directory, which is read-only.

I set the path to the working directory

md.path = pathlib.Path('.')

and now it runs.

7 Likes

@z0k I am getting the
OSError: [Errno 30] Read-only file system: '../input/tmp'
even after doing:
data.path = pathlib.Path('.')

Sounds like you haven’t connected the Kaggle dataset.

@jeremy I have connected both the planets dataset and the resnet pretrained weights dataset, and they are both shown in kaggle/input/. I also tested displaying a few images in the kernel and it works. Here is the link to my Kaggle kernel: https://www.kaggle.com/shivamsaboo17/planetsfastai/code


Here is the code:
import pathlib
data.path = pathlib.Path('.')
data = data.resize(int(sz * 1.3), '.') # Tells to ignore training images more than sz * 1.3 to save time
learn = ConvLearner.pretrained(model, data, metrics=metrics)
And the error logs for the above code:
OSError Traceback (most recent call last)
in ()
1 import pathlib
2 data.path = pathlib.Path('.')
----> 3 data = data.resize(int(sz * 1.3), '.') # Tells to ignore training images more than sz * 1.3 to save time
4 learn = ConvLearner.pretrained(model, data, metrics=metrics)

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in resize(self, targ_sz, new_path)
344 else: dls += [None,None]
345 t = tqdm_notebook(dls)
--> 346 for dl in t: new_ds.append(self.resized(dl, targ_sz, new_path))
347 t.close()
348 return self.__class__(new_ds[0].path, new_ds, self.bs, self.num_workers, self.classes)

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in resized(self, dl, targ, new_path)
336
337 def resized(self, dl, targ, new_path):
--> 338 return dl.dataset.resize_imgs(targ,new_path) if dl else None
339
340 def resize(self, targ_sz, new_path='tmp'):

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in resize_imgs(self, targ, new_path)
233
234 def resize_imgs(self, targ, new_path):
--> 235 dest = resize_imgs(self.fnames, targ, self.path, new_path)
236 return self.__class__(self.fnames, self.y, self.transform, dest)
237

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in resize_imgs(fnames, targ, path, new_path)
29 with ThreadPoolExecutor(8) as e:
30 ims = e.map(lambda x: resize_img(x, targ, path, new_path), fnames)
—> 31 for x in tqdm(ims, total=len(fnames), leave=False): pass
32 return os.path.join(path,new_path,str(targ))
33

/opt/conda/lib/python3.6/site-packages/tqdm/_tqdm.py in __iter__(self)
895 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
896
--> 897 for obj in iterable:
898 yield obj
899 # Update and possibly print the progressbar.

/opt/conda/lib/python3.6/concurrent/futures/_base.py in result_iterator()
584 # Careful not to keep a reference to the popped future
585 if timeout is None:
–> 586 yield fs.pop().result()
587 else:
588 yield fs.pop().result(end_time - time.time())

/opt/conda/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
423 raise CancelledError()
424 elif self._state == FINISHED:
–> 425 return self.__get_result()
426
427 self._condition.wait(timeout)

/opt/conda/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
–> 384 raise self._exception
385 else:
386 return self._result

/opt/conda/lib/python3.6/concurrent/futures/thread.py in run(self)
54
55 try:
—> 56 result = self.fn(*self.args, **self.kwargs)
57 except BaseException as exc:
58 self.future.set_exception(exc)

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in <lambda>(x)
28 if not os.path.exists(os.path.join(path,new_path,str(targ),fnames[0])):
29 with ThreadPoolExecutor(8) as e:
---> 30 ims = e.map(lambda x: resize_img(x, targ, path, new_path), fnames)
31 for x in tqdm(ims, total=len(fnames), leave=False): pass
32 return os.path.join(path,new_path,str(targ))

/opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/dataset.py in resize_img(fname, targ, path, new_path)
22 ratio = targ/min(r,c)
23 sz = (scale_to(r, ratio, targ), scale_to(c, ratio, targ))
—> 24 os.makedirs(os.path.split(dest)[0], exist_ok=True)
25 im.resize(sz, Image.LINEAR).save(dest)
26

/opt/conda/lib/python3.6/os.py in makedirs(name, mode, exist_ok)
208 if head and tail and not path.exists(head):
209 try:
–> 210 makedirs(head, mode, exist_ok)
211 except FileExistsError:
212 # Defeats race condition when another thread created the path

/opt/conda/lib/python3.6/os.py in makedirs(name, mode, exist_ok)
218 return
219 try:
–> 220 mkdir(name, mode)
221 except OSError:
222 # Cannot rely on checking for EEXIST, since the operating system

OSError: [Errno 30] Read-only file system: '../input/planet-understanding-the-amazon-from-space/./83'

1 Like

You can skip the resizing step. Not much point using that in a kaggle kernel.

1 Like

Thanks that worked for now :slight_smile:
Just wanted to know if there is any other way to use the resizing functionality in a Kaggle kernel?

Hi @shivamsaboo17,

I don’t know of a workaround other than what I posted above.

Maybe it would be a good idea to allow the user to specify the path for a temp directory as a global config option?

1 Like

Hi all! I’ve been investigating using Kaggle Kernels on GPU as a quick way to start using fastai, and I ran into this same issue, which I found pretty frustrating. So I made a PR to the Kaggle docker-python image to download some commonly-used torchvision models (I started with resnets). Take a look–I hope the PR gets accepted so that people won’t keep getting stuck on this setup step (https://github.com/Kaggle/docker-python/pull/233)

2 Likes

I was also struggling here. The steps I followed were:

  1. Add dataset to the Kaggle Kernel
  2. Enable internet access so that the weights can be downloaded.
  3. Assuming the directory of the data is PATH, create symbolic links in the Kaggle working directory. e.g.

!ln -s {PATH}train
!ln -s {PATH}test1
!ln -s {PATH}valid
4. Change PATH to be working directory:
PATH = '/kaggle/working/'

4 Likes

I am facing a problem when trying to use resnext101_64 for Dog breed identification. The code searches for weights at /opt/conda/lib/python3.6/site-packages/fastai-0.6-py3.6.egg/fastai/weights.
Now as the location is read only I cannot push the downloaded weights to this location.

I am unable to download Resnext101_64 while being able to do for other arch’s like resnet101 and resnet32. What might be the reason?

Here is a snippet that could help with running pretrained models without downloading the weights every time in a Kaggle notebook.
In data, add ResNet34, ResNet50, ResNet101, ResNet152, ResNet18 etc…

!mkdir -p ~/.torch/models
for model_name in model_urls:
    model_filename=os.path.basename(model_urls[model_name])
    dst=os.path.expanduser(os.path.join("~/.torch/models",model_filename))
    src=os.path.expanduser(os.path.join("/kaggle/input",model_name,model_name+".pth"))
    if os.path.exists(src) and not os.path.exists(dst):
        print("{}: {} -> {}".format(model_name, src, dst))
        os.symlink(src, dst)
!ls ~/.torch/models
2 Likes

Hello,

I’m running into the same issue - ConvLearner looking for weights in ‘/opt/conda/lib/python3.6/site-packages/fastai/weights/resnext_101_64x4d.pth’.
I could successfully run dog breeds identification with Resnet34 but am unable to use resnext101 weights in a Kaggle kernel. This is where I uploaded the weights - https://www.kaggle.com/vkodedal/resnext101-64
I copied the weights to the tmp dir using: !cp ../input/resnext101-64/resnext_101_64x4d.pth /tmp/.torch/models/resnext_101_64x4d.pth
Now, how do I make ConvLearner look for weights at the above tmp location ?

Thank you,
Vidyadhar

The name should be resnet101-5d3b4d8f.pth
!cp ../input/resnext101-64/resnext_101_64x4d.pth ~/.torch/models/resnet101-5d3b4d8f.pth
The pytorch file name should be of format <resnetname>-<sha-hash>.pth

Regards,
Karthikeyan Natarajan

1 Like

I tried it -
ls -l ~/.torch/models/
total 412528 -rw-r--r-- 1 root root 334698549 Nov 16 11:30 resnet101-5d3b4d8f.pth

still getting the same error
FileNotFoundError: [Errno 2] No such file or directory: '/opt/conda/lib/python3.6/site-packages/fastai/weights/resnext_101_64x4d.pth'

Load the following dataset into your Kaggle kernel
https://www.kaggle.com/iafoss/pytorch-pretrained-models or your model
and then

!ln -s /kaggle/input/pytorch-pretrained-models/ /opt/conda/lib/python3.6/site-packages/fastai/weights
1 Like

Thank you Karthikeyan.
It worked and I could complete dog breed identification with Resnext101.

I can’t do it - read-only system
My kernel is https://www.kaggle.com/sq5rix/hcd-fast-ai/
I can run it on my laptop, but it fails in the Kaggle kernel