I’m getting the same error message when using TextLMDataBunch.from_csv() to load a 4.5GB file, although I only have 62GB of RAM. The load appears to use all of the RAM and only crashes after 1-2 hours of trying, with the progress bar showing over 50% loaded. It also runs all 6 cores of my single Intel CPU at 100% each. The command didn’t fail when I limited my input file to the first 100 lines. Here is my command and traceback:
This is the code that failed:
path = datapath4file('/media/DataHD2/Notes_PHI_20190121/notes_dana_hp')
data_lm = TextLMDataBunch.from_csv(path=path, csv_name='notes_hp_half.txt', text_cols='note_text',
header=0)
This is the traceback when it crashed
BrokenProcessPool Traceback (most recent call last)
in
4 path = datapath4file('/media/DataHD2/Notes_PHI_20190121/notes_dana_hp')
5 data_lm = TextLMDataBunch.from_csv(path=path, csv_name='notes_hp_half.txt', text_cols='note_text',
----> 6 header=0)
7 # notes_hp_half.txt = 2,600,000 rows; 4,382,461,616 bytes
8 # expected time: 1 hr 48 min
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/data.py in from_csv(cls, path, csv_name, valid_pct, test, tokenizer, vocab, classes, header, text_cols, label_cols, label_delim, **kwargs)
219 test_df = None if test is None else pd.read_csv(Path(path)/test, header=header)
220 return cls.from_df(path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols,
--> 221 label_cols, label_delim, **kwargs)
222
223 @classmethod
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, **kwargs)
203 src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
204 TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
--> 205 if cls==TextLMDataBunch: src = src.label_for_lm()
206 else: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
207 if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
423 self.valid = fv(*args, **kwargs)
424 self.__class__ = LabelLists
--> 425 self.process()
426 return self
427 return _inner
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/data_block.py in process(self)
470 "Process the inner datasets."
471 xp,yp = self.get_processors()
--> 472 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
473 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
474 for ds in self.lists:
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/data_block.py in process(self, xp, yp, name)
625 p.warns = []
626 self.x,self.y = self.x[~filt],self.y[~filt]
--> 627 self.x.process(xp)
628 return self
629
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/data_block.py in process(self, processor)
66 if processor is not None: self.processor = processor
67 self.processor = listify(self.processor)
---> 68 for p in self.processor: p.process(self)
69 return self
70
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/data.py in process(self, ds)
283 tokens = []
284 for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):
--> 285 tokens += self.tokenizer.process_all(ds.items[i:i+self.chunksize])
286 ds.items = tokens
287
~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/transform.py in process_all(self, texts)
114 if self.n_cpus <= 1: return self._process_all_1(texts)
115 with ProcessPoolExecutor(self.n_cpus) as e:
--> 116 return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])
117
118 class Vocab():
~/anaconda3/envs/fastaiv1/lib/python3.6/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
364 careful not to keep references to yielded objects.
365 """
--> 366 for element in iterable:
367 element.reverse()
368 while element:
~/anaconda3/envs/fastaiv1/lib/python3.6/concurrent/futures/_base.py in result_iterator()
584 # Careful not to keep a reference to the popped future
585 if timeout is None:
--> 586 yield fs.pop().result()
587 else:
588 yield fs.pop().result(end_time - time.monotonic())
~/anaconda3/envs/fastaiv1/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~/anaconda3/envs/fastaiv1/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.