Upgrading to fastai 2.1.4 and PyTorch 1.7.0 causes the text models to fail

After upgrading today to the latest versions of fastai (2.1.4) and PyTorch (1.7.0),
the following code from 01_intro.ipynb now fails:

from fastai.text.all import *
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test',num_workers=0,bs=24)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

The error message is:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 54: character maps to <undefined>

The full stack trace is below. It is long, so good luck to the fastai team in spotting the issue.


UnicodeDecodeError Traceback (most recent call last)
in
3 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid=‘test’,num_workers=0,bs=24)
4 learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
----> 5 learn.fine_tune(4, 1e-2)

~\anaconda3\envs\fastai\lib\site-packages\fastcore\logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

~\anaconda3\envs\fastai\lib\site-packages\fastai\callback\schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
159 “Fine tune with freeze for freeze_epochs then with unfreeze from epochs using discriminative LR”
160 self.freeze()
–> 161 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
162 base_lr /= 2
163 self.unfreeze()

~\anaconda3\envs\fastai\lib\site-packages\fastcore\logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

~\anaconda3\envs\fastai\lib\site-packages\fastai\callback\schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
111 scheds = {‘lr’: combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
112 ‘mom’: combined_cos(pct_start, *(self.moms if moms is None else moms))}
–> 113 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
114
115 # Cell

~\anaconda3\envs\fastai\lib\site-packages\fastcore\logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
205 self.opt.set_hypers(lr=self.lr if lr is None else lr)
206 self.n_epoch = n_epoch
–> 207 self._with_events(self._do_fit, ‘fit’, CancelFitException, self._end_cleanup)
208
209 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in with_events(self, f, event_type, ex, final)
153
154 def with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in _do_fit(self)
195 for epoch in range(self.n_epoch):
196 self.epoch=epoch
–> 197 self._with_events(self._do_epoch, ‘epoch’, CancelEpochException)
198
199 @log_args(but=‘cbs’)

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in with_events(self, f, event_type, ex, final)
153
154 def with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in _do_epoch(self)
189
190 def _do_epoch(self):
–> 191 self._do_epoch_train()
192 self._do_epoch_validate()
193

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in _do_epoch_train(self)
181 def _do_epoch_train(self):
182 self.dl = self.dls.train
–> 183 self._with_events(self.all_batches, ‘train’, CancelTrainException)
184
185 def _do_epoch_validate(self, ds_idx=1, dl=None):

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in with_events(self, f, event_type, ex, final)
153
154 def with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()

~\anaconda3\envs\fastai\lib\site-packages\fastai\learner.py in all_batches(self)
159 def all_batches(self):
160 self.n_iter = len(self.dl)
–> 161 for o in enumerate(self.dl): self.one_batch(*o)
162
163 def _do_one_batch(self):

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\load.py in iter(self)
100 self.before_iter()
101 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
–> 102 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
103 if self.device is not None: b = to_device(b, self.device)
104 yield self.after_batch(b)

~\anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in next(self)
433 if self._sampler_iter is None:
434 self._reset()
–> 435 data = self._next_data()
436 self._num_yielded += 1
437 if self._dataset_kind == _DatasetKind.Iterable and \

~\anaconda3\envs\fastai\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
473 def _next_data(self):
474 index = self._next_index() # may raise StopIteration
–> 475 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
476 if self._pin_memory:
477 data = _utils.pin_memory.pin_memory(data)

~\anaconda3\envs\fastai\lib\site-packages\torch\utils\data_utils\fetch.py in fetch(self, possibly_batched_index)
32 raise StopIteration
33 else:
—> 34 data = next(self.dataset_iter)
35 return self.collate_fn(data)
36

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\load.py in create_batches(self, samps)
109 self.it = iter(self.dataset) if self.dataset is not None else None
110 res = filter(lambda o:o is not None, map(self.do_item, samps))
–> 111 yield from map(self.do_batch, self.chunkify(res))
112
113 def new(self, dataset=None, cls=None, **kwargs):

~\anaconda3\envs\fastai\lib\site-packages\fastcore\basics.py in chunked(it, chunk_sz, drop_last, n_chunks)
196 if not isinstance(it, Iterator): it = iter(it)
197 while True:
–> 198 res = list(itertools.islice(it, chunk_sz))
199 if res and (len(res)==chunk_sz or not drop_last): yield res
200 if len(res)<chunk_sz: return

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\load.py in do_item(self, s)
122 def prebatched(self): return self.bs is None
123 def do_item(self, s):
–> 124 try: return self.after_item(self.create_item(s))
125 except SkipItemException: return None
126 def chunkify(self, b): return b if self.prebatched else chunked(b, self.bs, self.drop_last)

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\load.py in create_item(self, s)
128 def randomize(self): self.rng = random.Random(self.rng.randint(0,2**32-1))
129 def retain(self, res, b): return retain_types(res, b[0] if is_listy(b) else b)
–> 130 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
131 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
132 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\core.py in getitem(self, it)
312
313 def getitem(self, it):
–> 314 res = tuple([tl[it] for tl in self.tls])
315 return res if is_indexer(it) else list(zip(*res))
316

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\core.py in (.0)
312
313 def getitem(self, it):
–> 314 res = tuple([tl[it] for tl in self.tls])
315 return res if is_indexer(it) else list(zip(*res))
316

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\core.py in getitem(self, idx)
278 res = super().getitem(idx)
279 if self._after_item is None: return res
–> 280 return self._after_item(res) if is_indexer(idx) else res.map(self._after_item)
281
282 # Cell

~\anaconda3\envs\fastai\lib\site-packages\fastai\data\core.py in _after_item(self, o)
240 return super()._new(items, tfms=self.tfms, do_setup=False, types=self.types, split_idx=split_idx, **kwargs)
241 def subset(self, i): return self._new(self._get(self.splits[i]), split_idx=i)
–> 242 def _after_item(self, o): return self.tfms(o)
243 def repr(self): return f"{self.class.name}: {self.items}\ntfms - {self.tfms.fs}"
244 def iter(self): return (self[i] for i in range(len(self)))

~\anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in call(self, o)
196 self.fs.append(t)
197
–> 198 def call(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
199 def repr(self): return f"Pipeline: {’ -> '.join([f.name for f in self.fs if f.name != ‘noop’])}"
200 def getitem(self,i): return self.fs[i]

~\anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
148 for f in tfms:
149 if not is_enc: f = f.decode
–> 150 x = f(x, **kwargs)
151 return x
152

~\anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in call(self, x, **kwargs)
71 @property
72 def name(self): return getattr(self, ‘_name’, _get_name(self))
—> 73 def call(self, x, **kwargs): return self._call(‘encodes’, x, **kwargs)
74 def decode (self, x, **kwargs): return self._call(‘decodes’, x, **kwargs)
75 def repr(self): return f’{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}’

~\anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in _call(self, fn, x, split_idx, **kwargs)
81 def _call(self, fn, x, split_idx=None, **kwargs):
82 if split_idx!=self.split_idx and self.split_idx is not None: return x
—> 83 return self._do_call(getattr(self, fn), x, **kwargs)
84
85 def _do_call(self, f, x, **kwargs):

~\anaconda3\envs\fastai\lib\site-packages\fastcore\transform.py in do_call(self, f, x, **kwargs)
87 if f is None: return x
88 ret = f.returns_none(x) if hasattr(f,‘returns_none’) else None
—> 89 return retain_type(f(x, **kwargs), x, ret)
90 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
91 return retain_type(res, x)

~\anaconda3\envs\fastai\lib\site-packages\fastcore\dispatch.py in call(self, *args, **kwargs)
127 elif self.inst is not None: f = MethodType(f, self.inst)
128 elif self.owner is not None: f = MethodType(f, self.owner)
–> 129 return f(*args, **kwargs)
130
131 def get(self, inst, owner):

~\anaconda3\envs\fastai\lib\site-packages\fastai\text\core.py in encodes(self, o)
290 if self.mode==‘folder’ and str(o).startswith(str(self.path)):
291 tok = self.output_dir/o.relative_to(self.path)
–> 292 return L(tok.read_text().split(’ '))
293 else: return self._tokenize1(o.read_text())
294

~\anaconda3\envs\fastai\lib\pathlib.py in read_text(self, encoding, errors)
1231 """
1232 with self.open(mode=‘r’, encoding=encoding, errors=errors) as f:
-> 1233 return f.read()
1234
1235 def write_bytes(self, data):

~\anaconda3\envs\fastai\lib\encodings\cp1252.py in decode(self, input, final)
21 class IncrementalDecoder(codecs.IncrementalDecoder):
22 def decode(self, input, final=False):
—> 23 return codecs.charmap_decode(input,self.errors,decoding_table)[0]
24
25 class StreamWriter(Codec,codecs.StreamWriter):

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 54: character maps to <undefined>

I am having a similar issue with fastai 2.1.9 and PyTorch 1.7.1 (on Windows 10):

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-8-af52b5b168b7> in <module>
----> 1 dls = TextDataLoaders.from_folder(source, valid='test', num_workers=0, encoding='utf-8')

C:\ProgramData\Anaconda3\lib\site-packages\fastai\text\data.py in from_folder(cls, path, train, valid, valid_pct, seed, vocab, text_vocab, is_lm, tok_tfm, seq_len, backwards, **kwargs)
    249         "Create from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)"
    250         splitter = GrandparentSplitter(train_name=train, valid_name=valid) if valid_pct is None else RandomSplitter(valid_pct, seed=seed)
--> 251         blocks = [TextBlock.from_folder(path, text_vocab, is_lm, seq_len, backwards) if tok_tfm is None else TextBlock(tok_tfm, text_vocab, is_lm, seq_len, backwards)]
    252         if not is_lm: blocks.append(CategoryBlock(vocab=vocab))
    253         get_items = partial(get_text_files, folders=[train,valid]) if valid_pct is None else get_text_files

C:\ProgramData\Anaconda3\lib\site-packages\fastai\text\data.py in from_folder(cls, path, vocab, is_lm, seq_len, backwards, min_freq, max_vocab, **kwargs)
    237     def from_folder(cls, path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):
    238         "Build a `TextBlock` from a `path`"
--> 239         return cls(Tokenizer.from_folder(path, **kwargs), vocab=vocab, is_lm=is_lm, seq_len=seq_len,
    240                    backwards=backwards, min_freq=min_freq, max_vocab=max_vocab)
    241 

C:\ProgramData\Anaconda3\lib\site-packages\fastai\text\core.py in from_folder(cls, path, tok, rules, **kwargs)
    275         path = Path(path)
    276         if tok is None: tok = WordTokenizer()
--> 277         output_dir = tokenize_folder(path, tok=tok, rules=rules, **kwargs)
    278         res = cls(tok, counter=load_pickle(output_dir/fn_counter_pkl),
    279                   lengths=load_pickle(output_dir/fn_lengths_pkl), rules=rules, mode='folder')

C:\ProgramData\Anaconda3\lib\site-packages\fastai\text\core.py in tokenize_folder(path, extensions, folders, output_dir, skip_if_exists, **kwargs)
    182     files = get_files(path, extensions=extensions, recurse=True, folders=folders)
    183     def _f(i,output_dir): return output_dir/files[i].relative_to(path)
--> 184     return _tokenize_files(_f, files, path, skip_if_exists=skip_if_exists, **kwargs)
    185 
    186 # Cell

C:\ProgramData\Anaconda3\lib\site-packages\fastai\text\core.py in _tokenize_files(func, files, path, output_dir, output_names, n_workers, rules, tok, encoding, skip_if_exists)
    167     for i,tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):
    168         out = func(i,output_dir)
--> 169         out.mk_write(' '.join(tok))
    170         lengths[str(files[i].relative_to(path))] = len(tok)
    171         counter.update(tok)

C:\ProgramData\Anaconda3\lib\site-packages\fastcore\xtras.py in mk_write(self, data, encoding, errors, mode)
    199     "Make all parent dirs of `self`, and write `data`"
    200     self.parent.mkdir(exist_ok=True, parents=True, mode=mode)
--> 201     self.write_text(data, encoding=encoding, errors=errors)
    202 
    203 # Cell

C:\ProgramData\Anaconda3\lib\pathlib.py in write_text(self, data, encoding, errors)
   1250                             data.__class__.__name__)
   1251         with self.open(mode='w', encoding=encoding, errors=errors) as f:
-> 1252             return f.write(data)
   1253 
   1254     def touch(self, mode=0o666, exist_ok=True):

C:\ProgramData\Anaconda3\lib\encodings\cp1252.py in encode(self, input, final)
     17 class IncrementalEncoder(codecs.IncrementalEncoder):
     18     def encode(self, input, final=False):
---> 19         return codecs.charmap_encode(input,self.errors,encoding_table)[0]
     20 
     21 class IncrementalDecoder(codecs.IncrementalDecoder):

UnicodeEncodeError: 'charmap' codec can't encode character '\x85' in position 458: character maps to <undefined>
1 Like