Hi,
As I needed to read images from an S3 bucket, I realised that this was not yet supported. I also noticed a few posts on the forum asking for help with this, so I wanted to share my humble attempt at building the feature.
from fastai.vision import *
from fastai.vision.image import pil2tensor
import torch
import boto3
import PIL
import numpy as np
import os
# No include/exclude arguments, but a custom filtering function can be supplied.
def get_s3_files(s3_resource, bucket, extensions, filter_func):
    """Return `(bucket, key)` pairs for the objects of an S3 bucket.

    Args:
        s3_resource: Object exposing `Bucket(name).objects.all()`, e.g. the
            result of `boto3.resource('s3')`.
        bucket: Name of the bucket to list.
        extensions: Iterable of file extensions including the leading dot
            (e.g. `['.jpg', '.png']`), or a single extension string. Matching
            ignores case. `None` disables extension filtering, so every key
            in the bucket is kept.
        filter_func: Optional predicate called with each `(bucket, key)` pair
            that passed the extension check; pairs for which it returns a
            falsy value are dropped. Pass `None` to keep everything.

    Returns:
        A list of `(bucket, key)` tuples, in the order the bucket listing
        yields them.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = [extensions]
    # Lower-case once so that 'photo.JPG' matches '.jpg'.
    wanted = None if extensions is None else {ext.lower() for ext in extensions}

    res = []
    for object_ in s3_resource.Bucket(bucket).objects.all():
        if wanted is not None:
            suffix = os.path.splitext(object_.key)[1].lower()
            if suffix not in wanted:
                continue
        item = (bucket, object_.key)
        if filter_func and not filter_func(item):
            continue
        res.append(item)
    return res
# Extend `ImageList` by inheritance, overriding only the few methods needed so
# that items are (bucket, key) pairs fetched from S3 rather than local paths.
class ImageListS3(ImageList):
    """`ImageList` whose items are `(bucket, key)` pairs stored in S3.

    Build one with `ImageListS3.from_s3_files(bucket='my-bucket')`.
    """

    def __init__(self, s3_resource, *args, **kwargs):
        """Store `s3_resource` and forward everything else to `ImageList`.

        Args:
            s3_resource: Object exposing `Object(bucket, key).get()`, e.g. the
                result of `boto3.resource('s3')`.
            *args, **kwargs: Passed unchanged to `ImageList.__init__`.
        """
        super().__init__(*args, **kwargs)
        self.s3_resource = s3_resource
        # NOTE(review): presumably fastai re-passes every attribute named in
        # `copy_new` when it builds a new list from this one (splits, labels),
        # so the resource follows the copies — confirm against `ItemList.new`.
        self.copy_new.append('s3_resource')

    @classmethod
    def from_s3_files(cls, bucket, s3_resource=None, extensions=('.jpg', '.png'),
                      filter_func=None, **kwargs):
        """Create an `ImageListS3` from the objects found in `bucket`.

        Args:
            bucket: Name of the S3 bucket to list.
            s3_resource: boto3 S3 resource to use. When `None`, one is created
                with `boto3.resource('s3')` at call time, which relies on the
                credentials configured in the environment.
            extensions: Extensions (with leading dot) of the keys to keep.
            filter_func: Optional predicate on `(bucket, key)` pairs; see
                `get_s3_files`.
            **kwargs: Forwarded to the class constructor.

        Returns:
            A new instance of `cls` holding one `(bucket, key)` item per
            matching object.
        """
        # Create the default resource lazily: a `boto3.resource('s3')` default
        # in the signature would be evaluated once, when the class is defined,
        # even for callers who provide their own resource.
        if s3_resource is None:
            s3_resource = boto3.resource('s3')
        items = get_s3_files(s3_resource, bucket, extensions, filter_func)
        return cls(s3_resource, items, path='', **kwargs)

    def open(self, s3_fn):
        """Open the item `s3_fn`, a `(bucket, key)` pair, as an `Image`.

        Overrides the parent `open()` only to call the method
        `self.open_image()` instead of a bare `open_image(...)`.
        NOTE(review): presumably a bare `open_image(...)` call would resolve to
        the library's module-level function — confirm against fastai.
        """
        return self.open_image(s3_fn, convert_mode=self.convert_mode,
                               after_open=self.after_open)

    def open_image(self, fn, div:bool=True, convert_mode:str='RGB', cls:type=Image,
                   after_open:Callable=None)->Image:
        """Return `Image` object created from the S3 object designated by `fn`.

        Args:
            fn: `(bucket, key)` pair; it is unpacked into
                `self.s3_resource.Object(bucket, key)`.
            div: When true, divide the pixel values by 255 in place.
            convert_mode: Mode passed to PIL's `convert()`.
            cls: Class wrapped around the resulting tensor.
            after_open: Optional callable applied to the PIL image before it
                is turned into a tensor.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning) # EXIF warning from TiffPlugin
            # Download the whole object into memory and let PIL decode it from
            # an in-memory buffer.
            file_byte_string = self.s3_resource.Object(*fn).get()['Body'].read()
            x = PIL.Image.open(BytesIO(file_byte_string)).convert(convert_mode)
        if after_open: x = after_open(x)
        x = pil2tensor(x,np.float32)
        if div: x.div_(255)
        return cls(x)
I put a few explanations directly next to the relevant parts of the code.
You just have to pass the bucket name (and optionally a boto3 S3 resource object), like so: ImageListS3.from_s3_files(bucket = 'my-bucket')
If you do not pass an S3 resource, it will try to create one. So just make sure all your credentials are stored correctly, so that the call to boto3.resource('s3') succeeds.
I assume there is room for improvement. Any correction is welcome!
Cheers