I have a custom per-character tokenizer with some custom preprocessing rules.
from fastai.text.core import fix_html, lowercase
def rm_useless_whitespace(text: str)->str:
return re.sub(r'\s+','_',text)
def rm_diacritics(text: str)->str:
return ''.join(c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn')
def remove_all_digits(text):
return re.sub(r'\d', '', text, flags=re.UNICODE)
def custom_lowercase(text: str)->str:
return text.lower().strip()
def latin_only(text: str)->str:
return re.sub(r'[^A-Za-z\s_]','',text)
# Note that lowercase is the thing that adds the BOS token
my_rules = fix_html, remove_all_digits, latin_only, lowercase, rm_useless_whitespace
class CharTokenizer(Transform):
def __init__(self):
pass
def SafeSplit(self, text):
text = re.sub(r'xxbos','X',text)
print("Text before splitting: ", text)
chars = list(text)
return ['xxbos' if c=='X' else c for c in chars]
def __call__(self, items):
# first, replace xxbos with X (we're already lowercase, so X is unique
return (self.SafeSplit(t) for t in items)
my_tokenizer=Tokenizer(tok=CharTokenizer(),rules=my_rules)
my_vocab = ['xxunk','xxbos'] + list('abcdefghijklmnopqrstuvwxyz_')
It works when I call it with a premade string array:
texts = ["This is a normal string", "This 123 string 456 contains 790 numbers.", 'Pun..ctua..tio,"n and $£%&*£$" special chars']
text_block = TextBlock(tok_tfm=my_tokenizer, is_lm=True, vocab=my_vocab)
dls_lm = DataBlock(
blocks=(text_block,),
get_x=lambda x: x,
splitter=RandomSplitter(0.1)
).dataloaders(texts, bs=3, seq_len=30)
returns
xxbos_this_is_a_normal_stringxxbos_p
But when I call it on the IMDB data using TextBlock.from_folder the spaces are replaced with xxunk instead of ‘_’
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])
dls_lm = DataBlock(
blocks=TextBlock.from_folder(path, tok=my_tokenizer, is_lm=True, vocab=my_vocab),
get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=1, seq_len=80)
returns
xxbosixxunkknewxxunkitxxunkwasntxxunkgunnaxxunkworkxxunkoutxxunkbetweenxxunkmexxunkandxxunkdwarsxxunkfromxxunkthexxunkmomentxxunkwexxunkmetxxunkfirs
I swear I’ve read the documentation, but if the answer is there then I haven’t understood it.
[Edit] Based on some further experimentation, it appears that the rules I defined don’t get applied in from_folder
- for instance a rule that replaces t
with p
does nothing.