I’m trying to train a language model on a corpus of ~64 GB of domain-specific text. My own machine isn’t beefy enough, so I spun up a Google Compute Engine instance with 356 GB of RAM and left it to work overnight. When I went to bed, it was estimating about 19 more hours to finish building the databunch:
data_lm = (TextList.from_df(df, path=path)
           .random_split_by_pct(0.1)
           .label_for_lm()
           .databunch(bs=80))
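For context, as I understand fastai v1, this step tokenizes the whole corpus with spaCy across a pool of worker processes, one per core by default; I left that at the default for this run. The worker count can apparently be capped globally with something like this (my reading of the API, not verified):

from fastai.core import defaults

defaults.cpus = 4  # cap forked tokenizer workers; fewer forks should mean lower peak memory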
But when I woke up this morning, it had died with this error:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/site-packages/spacy/util.py", line 50, in get_lang_class
module = importlib.import_module('.lang.%s' % lang, 'spacy')
File "/opt/anaconda3/lib/python3.7/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 677, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 728, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/opt/anaconda3/lib/python3.7/site-packages/spacy/lang/en/__init__.py", line 15, in <module>
from ...language import Language
File "/opt/anaconda3/lib/python3.7/site-packages/spacy/language.py", line 15, in <module>
from .tokenizer import Tokenizer
ImportError: /opt/anaconda3/lib/python3.7/site-packages/spacy/tokenizer.cpython-37m-x86_64-linux-gnu.so: failed to map segment from shared object
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/concurrent/futures/process.py", line 232, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/opt/anaconda3/lib/python3.7/concurrent/futures/process.py", line 191, in _process_chunk
return [fn(*args) for args in chunk]
File "/opt/anaconda3/lib/python3.7/concurrent/futures/process.py", line 191, in <listcomp>
return [fn(*args) for args in chunk]
File "/opt/anaconda3/lib/python3.7/site-packages/fastai/text/transform.py", line 112, in _process_all_1
tok = self.tok_func(self.lang)
File "/opt/anaconda3/lib/python3.7/site-packages/fastai/text/transform.py", line 25, in __init__
self.tok = spacy.blank(lang)
File "/opt/anaconda3/lib/python3.7/site-packages/spacy/__init__.py", line 25, in blank
LangClass = util.get_lang_class(name)
File "/opt/anaconda3/lib/python3.7/site-packages/spacy/util.py", line 52, in get_lang_class
raise ImportError(Errors.E048.format(lang=lang))
ImportError: [E048] Can't import language en from spacy.lang.
"""
The above exception was the direct cause of the following exception:
ImportError Traceback (most recent call last)
<ipython-input-5-eea942f12256> in <module>
1 data_lm = (TextList.from_df(df, path=path)
----> 2 .random_split_by_pct(0.1)
3 .label_for_lm()
4 .databunch(bs=80))
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
439 self.valid = fv(*args, from_item_lists=True, **kwargs)
440 self.__class__ = LabelLists
--> 441 self.process()
442 return self
443 return _inner
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self)
493 "Process the inner datasets."
494 xp,yp = self.get_processors()
--> 495 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
496 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
497 for ds in self.lists:
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self, xp, yp, name)
665 p.warns = []
666 self.x,self.y = self.x[~filt],self.y[~filt]
--> 667 self.x.process(xp)
668 return self
669
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self, processor)
72 if processor is not None: self.processor = processor
73 self.processor = listify(self.processor)
---> 74 for p in self.processor: p.process(self)
75 return self
76
/opt/anaconda3/lib/python3.7/site-packages/fastai/text/data.py in process(self, ds)
286 tokens = []
287 for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):
--> 288 tokens += self.tokenizer.process_all(ds.items[i:i+self.chunksize])
289 ds.items = tokens
290
/opt/anaconda3/lib/python3.7/site-packages/fastai/text/transform.py in process_all(self, texts)
118 if self.n_cpus <= 1: return self._process_all_1(texts)
119 with ProcessPoolExecutor(self.n_cpus) as e:
--> 120 return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])
121
122 class Vocab():
/opt/anaconda3/lib/python3.7/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
474 careful not to keep references to yielded objects.
475 """
--> 476 for element in iterable:
477 element.reverse()
478 while element:
/opt/anaconda3/lib/python3.7/concurrent/futures/_base.py in result_iterator()
584 # Careful not to keep a reference to the popped future
585 if timeout is None:
--> 586 yield fs.pop().result()
587 else:
588 yield fs.pop().result(end_time - time.monotonic())
/opt/anaconda3/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
423 raise CancelledError()
424 elif self._state == FINISHED:
--> 425 return self.__get_result()
426
427 self._condition.wait(timeout)
/opt/anaconda3/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ImportError: [E048] Can't import language en from spacy.lang.
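From some searching, "failed to map segment from shared object" (the root error buried in the worker traceback above) seems to be what you get when mmap fails while loading a shared library, which would fit the forked tokenizer workers running out of memory. Here’s a quick sanity check I can run to see whether freshly forked processes can still import spaCy (just a sketch; the worker count of 8 is arbitrary):

from concurrent.futures import ProcessPoolExecutor

def load_spacy(_):
    # re-import spaCy inside each worker, which is where the original failure happened
    import spacy
    return spacy.blank('en') is not None

if __name__ == '__main__':
    with ProcessPoolExecutor(8) as e:
        print(all(e.map(load_spacy, range(8))))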
I’m wondering if I still ran out of memory. free says I have 135 GB available, but I’m not sure whether it evicted anything huge after the exception was raised.
jupyter@my-fastai-instance:~$ free -mh
              total        used        free      shared  buff/cache   available
Mem:           307G        171G        135G         11M         91M        133G
Swap:            0B          0B          0B
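For the next attempt I’m planning to log memory headroom in the background so I can see exactly where it falls over. Something like this (assumes psutil is installed; the interval is arbitrary):

import threading, time, psutil

def log_mem(interval=60):
    # print available memory once a minute so the notebook log captures the trend
    while True:
        print(f"available: {psutil.virtual_memory().available / 2**30:.1f} GiB", flush=True)
        time.sleep(interval)

threading.Thread(target=log_mem, daemon=True).start()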
Is there anything else worth looking into before I double the memory and let it run again? One other odd thing: when I rebooted/resized the instance, the conda install got a bit messed up. I had to reinstall fastai, and it complained about pytorch being missing for a while before sorting itself out. Maybe it missed reinstalling another dependency?
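One thing I’m also considering before resizing: forcing single-process tokenization, so spaCy only has to load once in the main process instead of in every forked worker. If I’m reading the fastai v1 API right, passing a Tokenizer with n_cpus=1 through a custom processor should do it. A sketch (untested, and presumably much slower, but it sidesteps the fork entirely):

from fastai.text import TextList, Tokenizer, TokenizeProcessor, NumericalizeProcessor

# n_cpus=1 makes process_all tokenize in-process instead of via ProcessPoolExecutor
tok_proc = TokenizeProcessor(tokenizer=Tokenizer(n_cpus=1))
num_proc = NumericalizeProcessor()

data_lm = (TextList.from_df(df, path=path, processor=[tok_proc, num_proc])
           .random_split_by_pct(0.1)
           .label_for_lm()
           .databunch(bs=80))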