I am following the custom new task tutorial and I am stuck at the mid-level API (trying to reproduce the steps in the tutorial): I need to decide on the output types of the tuple created by `encodes`,
which needs to work together with further Transforms.
from typing import Tuple
import numpy as np
import pandas as pd
from fastai.tabular.all import *
# Size of the synthetic training table: N rows (samples) by M columns (features).
N = 150
M = 15
# Value handed to create_df as its `prop_na` argument when the tables are built.
prop_na = 0.1
def create_df(N: int, M: int, scaling_factor: float = 30.0,
              prop_na: float = 0.0, start_idx: int = 0) -> pd.DataFrame:
    """Build an ``N`` x ``M`` DataFrame of random floats, optionally with NaNs.

    Raw values are drawn with ``np.random.rand`` (so they depend on NumPy's
    global random state) and are multiplied by ``scaling_factor`` afterwards.

    Args:
        N: Number of rows (samples).
        M: Number of columns (features).
        scaling_factor: Multiplier applied to every drawn value.
        prop_na: Threshold strictly between 0 and 1. Every raw draw below it
            is replaced by NaN, so roughly this fraction of entries ends up
            missing. Any value outside the open interval (0, 1) leaves the
            data fully observed.
        start_idx: Number used in the label of the first row.

    Returns:
        DataFrame with row labels ``sample_<i>`` for ``i`` in
        ``start_idx .. start_idx + N - 1`` and column labels ``feat_<j>`` for
        ``j`` in ``0 .. M - 1``, both zero-padded.
    """
    values = np.random.rand(N, M)
    if 0.0 < prop_na < 1.0:
        # Mask on the raw draws, i.e. before scaling.
        values = np.where(values < prop_na, np.nan, values)
    values *= scaling_factor

    # Row labels are padded wide enough for the largest index actually used.
    # Padding by len(str(N)) alone gave labels of unequal width once
    # start_idx pushed the indices to more digits. The max() keeps the width
    # identical to the previous behaviour whenever that was already enough.
    row_width = max(len(str(N)), len(str(start_idx + N)))
    col_width = len(str(M))
    row_labels = [f'sample_{i:0{row_width}}'
                  for i in range(start_idx, start_idx + N)]
    col_labels = [f'feat_{j:0{col_width}}' for j in range(M)]
    return pd.DataFrame(values, index=row_labels, columns=col_labels)
# Training table: N samples, M features, `prop_na` forwarded to create_df.
train_X = create_df(N=N, M=M, prop_na=prop_na)
# Validation table: same feature count, a tenth as many samples (truncated to int).
valid_X = create_df(N=int(N*0.1), M=M, prop_na=prop_na)
class DatasetTransform(Transform):
    """Transform that turns a row position into a ``(mask, data)`` tensor pair.

    The wrapped DataFrame is kept on the instance together with its
    missing-value mask; ``encodes`` looks both up by integer position.
    """

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and its missing-value mask.

        Args:
            df: The table to serve rows from.

        Raises:
            ValueError: If ``df`` is not a pandas DataFrame (or subclass).
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                f'please pass a pandas DataFrame, not: {type(df) = }')
        super().__init__()
        # NOTE: despite the attribute name, `isna()` is True where a value is
        # MISSING. Append `.astype('uint8')` here if 0/1 integers are preferred.
        self.mask_obs = df.isna()
        self.data = df

    # NOTE(review): no return annotation on purpose. The original author
    # remarks that an annotation on `encodes` "is interpreted" by the
    # Transform machinery. Confirm before adding one.
    def encodes(self, idx):
        """Return ``(mask, data)`` for the row(s) at integer position ``idx``.

        Both elements are built with ``Tensor(...)`` from the ``.iloc[idx]``
        selection of the stored mask and of the stored data respectively.
        """
        mask_row = self.mask_obs.iloc[idx]
        data_row = self.data.iloc[idx]
        return (Tensor(mask_row), Tensor(data_row))

    def to_tensor(self, s: pd.Series) -> torch.Tensor:
        """Convert a Series to a tensor via ``torch.from_numpy`` on its values.

        Not used by ``encodes``; kept as an alternative conversion helper.
        """
        return torch.from_numpy(s.values)
# The items of each TfmdLists are plain row positions; DatasetTransform looks
# the actual rows up in the DataFrame it was constructed with.
train_tl = TfmdLists(range(len(train_X)), DatasetTransform(train_X))
valid_tl = TfmdLists(range(len(valid_X)), DatasetTransform(valid_X))

# Normalize is not wired in yet. The hooks that were tried here were
# `after_item=[Normalize]` and `after_batch=[Normalize]`.
dls = DataLoaders.from_dsets(
    train_tl,
    valid_tl,
    bs=4,
)

# Show how the `encodes` method is registered on the class, then pull one batch.
print(f"{DatasetTransform.encodes = }")
dls.one_batch()
which gives
# DatasetTransform.encodes = (object,object) -> encodes
(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
[1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.]]),
tensor([[25.1777, 29.3657, 21.3131, 8.1900, 17.6949, 24.3187, 5.9157, 28.1306,
6.9885, 29.4994, 6.2627, 4.5678, nan, nan, 21.1621],
[ nan, 17.1096, 5.7025, 20.7072, 25.2490, nan, 21.9753, 22.0570,
19.1355, 7.1306, 3.0970, 13.0816, 18.1946, nan, nan],
[ nan, 29.9700, 29.6510, 19.5317, 27.2835, 16.0205, 5.0213, 12.9468,
3.7647, 10.9581, 16.7813, 12.0690, 8.2697, 5.9754, 8.1054],
[ nan, 17.2954, 28.0015, 12.9694, 15.8703, 8.4406, nan, 18.5286,
7.6147, nan, 4.7754, 22.1028, 23.5528, 29.1192, 28.7750]]))
My question is: how do I get this to work with Normalize from Tabular? Do I need to use a different Normalize? Does it need initialization? I am a bit lost, as the tutorial is for images and not tabular data… Any help is much appreciated :)