@rob I generalized your PredictHapinessDataset for any DataFrame.
I could not find a better way to pass the path for the validation and test sets. If anyone has a suggestion for improvement, I would really appreciate it.
class DataFrameDataset(torchtext.data.Dataset):
    """A torchtext dataset whose examples come from in-memory DataFrames.

    Every row of the selected DataFrame becomes one example with two
    fields, ``text`` and ``label``.
    """

    def __init__(self, path, text_field, label_field, col, label, dfs, **kwargs):
        """Build the examples for a single split.

        Args:
            path: Name of the split to load, used as a key into ``dfs``.
                If the value is not a key of ``dfs``, its last path
                component is tried instead.
            text_field: Field paired with the ``text`` attribute.
            label_field: Field paired with the ``label`` attribute.
            col: Name of the DataFrame column holding the text.
            label: Name of the DataFrame column holding the label.
            dfs: Mapping from split name to DataFrame.
            **kwargs: Forwarded to the parent ``Dataset`` constructor.

        Raises:
            KeyError: If ``dfs`` has no entry for the split, or the frame
                lacks the ``col`` / ``label`` column.
        """
        import os

        # NOTE(review): torchtext's Dataset.splits appears to join its `path`
        # argument with the split name before calling this constructor, so a
        # non-empty `path` arrives here as e.g. "some/dir/train" -- confirm
        # against the installed torchtext version. Falling back to the last
        # component keeps the lookup working for any `path` value.
        split_name = path if path in dfs else os.path.basename(path)
        frame = dfs[split_name]

        fields = [("text", text_field), ("label", label_field)]
        # Walk both columns in lockstep instead of indexing row by row; this
        # also avoids materialising the whole frame just to count its rows.
        examples = [
            torchtext.data.Example.fromlist([text_value, label_value], fields)
            for text_value, label_value in zip(frame[col], frame[label])
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's text, used to order examples."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, path, col, label, train,
               validation=None, test=None, **kwargs):
        """Create datasets for the train / validation / test DataFrames.

        Args:
            text_field: Field paired with the ``text`` attribute.
            label_field: Field paired with the ``label`` attribute.
            path: Prefix handed to the parent ``splits``; ``None`` is
                treated as the empty string.
            col: Name of the DataFrame column holding the text.
            label: Name of the DataFrame column holding the label.
            train: DataFrame for the training split.
            validation: Optional DataFrame for the validation split.
            test: Optional DataFrame for the test split.
            **kwargs: Forwarded to the parent ``splits``.

        Returns:
            Whatever the parent ``splits`` returns for the requested splits.
        """
        dfs = {"train": train}
        if validation is not None:
            dfs["validation"] = validation
        if test is not None:
            dfs["test"] = test

        # The DataFrames are already in memory, so no real location is
        # needed; an empty prefix leaves the split names untouched.
        if path is None:
            path = ""

        return super().splits(
            path,
            text_field=text_field,
            label_field=label_field,
            col=col,
            label=label,
            train="train",
            validation="validation" if "validation" in dfs else None,
            test="test" if "test" in dfs else None,
            dfs=dfs,
            **kwargs
        )