My S3 ImageList version

Hi,

I needed to read images from an S3 bucket and realised that this was not yet supported. I also noticed a few posts on the forum asking for help with this, so I wanted to share my humble attempt at building this feature.

from fastai.vision import *
from fastai.vision.image import pil2tensor
import torch
import boto3
import PIL
import numpy as np
import os

# No include/exclude but possible to add a custom filtering function
def get_s3_files(s3_resource, bucket, extensions=None, filter_func=None):
    """List the objects of an S3 bucket as ``(bucket, key)`` tuples.

    Args:
        s3_resource: Object exposing ``Bucket(name).objects.all()``, e.g. a
            boto3 S3 service resource.
        bucket: Name of the bucket to list.
        extensions: Iterable of file extensions to keep, each including the
            leading dot (e.g. ``['.jpg', '.png']``). A single string is
            treated as one extension. Matching ignores case. ``None`` keeps
            every key regardless of its extension.
        filter_func: Optional predicate called with each ``(bucket, key)``
            tuple that passed the extension check; items for which it
            returns a falsy value are dropped.

    Returns:
        A list of ``(bucket, key)`` tuples, in the order the bucket yields
        its objects.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = [extensions]
    # Normalise once so the per-object check is a case-insensitive set lookup;
    # without this, 'photo.JPG' is silently skipped for extensions=['.jpg'].
    wanted = None if extensions is None else {ext.lower() for ext in extensions}

    res = []
    for object_ in s3_resource.Bucket(bucket).objects.all():
        key = object_.key
        if wanted is not None and os.path.splitext(key)[1].lower() not in wanted:
            continue
        item = (bucket, key)
        if filter_func and not filter_func(item):
            continue
        res.append(item)
    return res

# Extend the feature by inheritance
# We extend it by overriding a few methods
class ImageListS3(ImageList):
    """`ImageList` whose items are `(bucket, key)` tuples opened from S3.

    Each item is fetched through `self.s3_resource` when it is opened,
    instead of being read from the local file system.
    """

    def __init__(self, s3_resource, *args, **kwargs):
        """Store `s3_resource` and forward everything else to `ImageList`."""
        super().__init__(*args, **kwargs)
        self.s3_resource = s3_resource
        # NOTE(review): presumably fastai uses `copy_new` to decide which
        # attributes are passed on when a new list is built from this one
        # (e.g. on split/filter) -- confirm against `ItemList.new`.
        self.copy_new.append('s3_resource')

    # New class method that allows you to create an ImageList out of S3 files
    @classmethod
    def from_s3_files(cls, bucket, s3_resource=None, extensions=('.jpg', '.png'), filter_func=None, **kwargs):
        """Build a list from the objects of `bucket` whose keys match `extensions`.

        Args:
            bucket: Name of the S3 bucket to list.
            s3_resource: boto3 S3 resource to use. When `None`, one is created
                with `boto3.resource('s3')` at call time.
            extensions: Extensions (with leading dot) of the keys to keep.
            filter_func: Optional predicate applied to each `(bucket, key)`
                tuple; passed through to `get_s3_files`.
            **kwargs: Forwarded to the class constructor.

        Returns:
            A new instance of `cls` whose items are `(bucket, key)` tuples.
        """
        # Create the resource lazily: a `boto3.resource('s3')` default in the
        # signature is evaluated once, when the class is defined, so merely
        # importing the module would need AWS configuration and every call
        # would share that single object.
        if s3_resource is None:
            s3_resource = boto3.resource('s3')
        return cls(s3_resource, get_s3_files(s3_resource, bucket, extensions, filter_func), path='', **kwargs)

    # Overrides the original open() method.
    # It calls the method self.open_image() rather than the library function
    # open_image(); otherwise the library's function would be the one called,
    # probably due to name scoping. This is a workaround: the default open()
    # could be kept if it dispatched to our own open_image().
    def open(self, s3_fn):
        """Open the item `s3_fn`, a `(bucket, key)` tuple, as an `Image`."""
        return self.open_image(s3_fn, convert_mode=self.convert_mode, after_open=self.after_open)

    # Our own open_image() as a method, rather than the original function
    def open_image(self, fn:tuple, div:bool=True, convert_mode:str='RGB', cls:type=Image,
        after_open:Callable=None)->Image:
        """Return a `cls` object created from the S3 object referenced by `fn`.

        Args:
            fn: Positional arguments for `s3_resource.Object`; the items
                produced by `from_s3_files` are `(bucket, key)` tuples.
            div: When true, divide the tensor in place by 255.
            convert_mode: Mode passed to `PIL.Image.convert`.
            cls: Type used to wrap the resulting tensor.
            after_open: Optional callable applied to the PIL image before it
                is converted to a tensor.
        """
        # Download the whole object body into memory.
        file_byte_string = self.s3_resource.Object(*fn).get()['Body'].read()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning) # EXIF warning from TiffPlugin
            x = PIL.Image.open(BytesIO(file_byte_string)).convert(convert_mode)
        if after_open: x = after_open(x)
        x = pil2tensor(x,np.float32)
        if div: x.div_(255)
        return cls(x)

I put a few explanations directly next to the relevant parts of the code.

You just have to pass the bucket name (and optionally a boto3 S3 resource object), like this: ImageListS3.from_s3_files(bucket = 'my-bucket')
If you do not pass an S3 resource, it will try to create one. So just make sure your credentials are configured correctly so that boto3.resource('s3') works when it runs.

I assume there is room for improvement. Any corrections are welcome!
Cheers

5 Likes

Nice work! One thing you could try is to write the image directly into an in-memory file object using boto3's download_fileobj:

# NOTE(review): `s3` and `s3_image_as_bytes` are not defined in this snippet --
# presumably a boto3 Bucket and an io.BytesIO created beforehand; confirm.
# Stream the object named by `image_name` into the file-like buffer.
s3.download_fileobj(image_name, s3_image_as_bytes)
# Move the buffer position back to the start so the next read begins at byte 0.
s3_image_as_bytes.seek(0)