Posting my code below, but I just wanted to share a few lessons that I’ve learned. The first thing I learned, and it is extremely important, is to only put the text field and the label field in the fields variable. This is important because there are other functions that use fields, and if you have extra columns, it will error out. The second is that you don’t necessarily need to have everything in a separate file to make this work. In this instance, I did because I didn’t understand how things worked, but if I were to do this again, I would just pull the data directly from a DataFrame to create my examples. Another reason I wanted to create this thread is to hear from other people who are having similar problems/solutions.
class PredictHappinessDataset(torchtext.data.Dataset):
    """Text-classification dataset read from per-label directories.

    Examples are built from ``*.txt`` files found under ``<path>/happy`` and
    ``<path>/not_happy``. Each example carries exactly two fields:
    ``Description`` (the text) and ``Is_Response`` (the label).
    """

    def __init__(self, path, text_field, label_field, **kwargs):
        """Collect one example per ``*.txt`` file under each label directory.

        Args:
            path: Directory containing the ``happy`` and ``not_happy``
                sub-directories.
            text_field: Field bound to the ``Description`` column.
            label_field: Field bound to the ``Is_Response`` column.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        # Only the text and label fields are listed here; every example is
        # built against exactly this pair.
        fields = [("Description", text_field), ("Is_Response", label_field)]

        def _first_line(file_name):
            # Only the first line of each file is used as the example text.
            with open(file_name, 'r') as handle:
                return handle.readline()

        # The directory name doubles as the label value of each example.
        examples = [
            data.Example.fromlist([_first_line(file_name), label], fields)
            for label in ('happy', 'not_happy')
            for file_name in iglob(os.path.join(path, label, '*.txt'))
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's ``Description`` attribute."""
        # NOTE(review): presumably a token count once the text field has
        # tokenized the description -- confirm against the field in use.
        return len(ex.Description)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Build the train and test splits; no validation split is requested.

        Args:
            text_field: Field used for the example text.
            label_field: Field used for the example label.
            root: Forwarded positionally as the first argument of the
                parent ``splits``.
            train: Name passed to the parent as the ``train`` split.
            test: Name passed to the parent as the ``test`` split.
            **kwargs: Forwarded unchanged to the parent ``splits``.

        Returns:
            Whatever the parent ``splits`` returns for these arguments.
        """
        # NOTE(review): which parent parameter the positional ``root`` binds
        # to depends on the installed torchtext version -- confirm.
        return super().splits(
            root,
            text_field=text_field,
            label_field=label_field,
            train=train,
            validation=None,
            test=test,
            **kwargs)