Correct output type for Tensor created from DataFrame - custom new task tutorial

enryh · December 1, 2021, 3:32pm

I am following the custom new task tutorial and I am stuck at the mid-level API (trying to reproduce the steps in the tutorial), where I need to decide on the output type of the tuple returned by encodes, which needs to work together with further Transforms.

from typing import Tuple

import numpy as np
import pandas as pd

from fastai.tabular.all import *

# Dimensions of the toy training set: N samples (rows) by M features (columns).
N = 150
M = 15

# Threshold handed to create_df as prop_na: uniform draws below it become NaN.
prop_na = 0.1

def create_df(N:int, M:int, scaling_factor:float=30.0, prop_na:float=0.0, start_idx:int=0):
    """Build an (N, M) DataFrame of scaled uniform random values.

    Values are drawn with ``np.random.rand`` and multiplied by
    ``scaling_factor``. When ``0 < prop_na < 1``, every draw below
    ``prop_na`` is replaced by NaN before scaling; any other ``prop_na``
    leaves the data complete.

    Rows are labelled ``sample_<i>`` for ``i`` starting at ``start_idx`` and
    columns ``feat_<j>``; the numbers are zero-padded to the number of digits
    of ``N`` and ``M`` respectively.
    """
    values = np.random.rand(N, M)

    if 0.0 < prop_na < 1.0:
        # The same draw decides missingness: entries below prop_na become NaN.
        values = np.where(values < prop_na, np.nan, values)

    values *= scaling_factor

    row_width = len(str(N))
    col_width = len(str(M))
    row_labels = [f'sample_{i:0{row_width}}' for i in range(start_idx, start_idx + N)]
    col_labels = [f'feat_{j:0{col_width}}' for j in range(M)]
    return pd.DataFrame(values, index=row_labels, columns=col_labels)



# Toy training frame, plus a validation frame with a tenth as many rows;
# both use the same number of features and the same prop_na.
train_X = create_df(N=N, M=M, prop_na=prop_na)
valid_X = create_df(N=int(N * 0.1), M=M, prop_na=prop_na)


class DatasetTransform(Transform):
    """Transform that maps a row position to a (mask, data) tensor pair.

    Stores a DataFrame together with its missing-value mask; `encodes` looks
    the requested row up by integer position in both.
    """

    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                f'please pass a pandas DataFrame, not: {type(df) = }')
        self.data = df
        # Boolean frame, True where a value is missing.
        # Append .astype('uint8') in case 0,1 is preferred.
        self.mask_obs = df.isna()

    # NOTE: `encodes` is intentionally left without a return annotation such
    # as `-> Tuple[torch.Tensor, torch.Tensor]`, because the annotation is
    # interpreted rather than ignored.
    def encodes(self, idx):
        row_mask = self.mask_obs.iloc[idx]
        row_data = self.data.iloc[idx]
        # Alternative: (self.to_tensor(row_mask), self.to_tensor(row_data))
        return (Tensor(row_mask), Tensor(row_data))

    def to_tensor(self, s: pd.Series) -> torch.Tensor:
        """Convert a Series to a tensor via its underlying numpy values."""
        return torch.from_numpy(s.values)


# One TfmdLists per split: the items are row positions, and each split gets
# its own DatasetTransform wrapping the matching DataFrame.
train_tl = TfmdLists(range(train_X.shape[0]), DatasetTransform(train_X))
valid_tl = TfmdLists(range(valid_X.shape[0]), DatasetTransform(valid_X))

# Normalize is still disabled in both hook positions below.
dls = DataLoaders.from_dsets(
    train_tl,
    valid_tl,
    # after_item=[Normalize],
    # after_batch=[Normalize],
    bs=4,
)
print(f"{DatasetTransform.encodes = }")
dls.one_batch()

which gives

# DatasetTransform.encodes = (object,object) -> encodes

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.]]),
 tensor([[25.1777, 29.3657, 21.3131,  8.1900, 17.6949, 24.3187,  5.9157, 28.1306,
           6.9885, 29.4994,  6.2627,  4.5678,     nan,     nan, 21.1621],
         [    nan, 17.1096,  5.7025, 20.7072, 25.2490,     nan, 21.9753, 22.0570,
          19.1355,  7.1306,  3.0970, 13.0816, 18.1946,     nan,     nan],
         [    nan, 29.9700, 29.6510, 19.5317, 27.2835, 16.0205,  5.0213, 12.9468,
           3.7647, 10.9581, 16.7813, 12.0690,  8.2697,  5.9754,  8.1054],
         [    nan, 17.2954, 28.0015, 12.9694, 15.8703,  8.4406,     nan, 18.5286,
           7.6147,     nan,  4.7754, 22.1028, 23.5528, 29.1192, 28.7750]]))

My question is: how do I get this to work with Normalize from Tabular? Do I need to use a different Normalize? Does it need initialization? I am a bit lost, as the tutorial is for images and not tabular data… Any help is much appreciated :)