I’m trying to add an enhancement to the Tokenizer that allows me to pass a “caps” flag that will tell the Tokenizer whether it should add the t_up, t_cap, and t_mx tokens or not. So my thought is that I can add the following to the proc_all_mp:
@staticmethod
def proc_all_mp(ss, lang='en', caps=True):
ncpus = num_cpus()//2
with ProcessPoolExecutor(ncpus) as e:
return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss), caps), [])
I tried to just add caps to this, but it didn’t work like I was hoping it would. Has anybody else used the map function before that knows how to pass something like this? Basically I want to just pass my bool caps
to the proc_all function so that whatever I feed into proc_all_mp will pass through and keep passing through all the way down to proc_text in text.py
Here’s the error I’m getting:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-9d41186996f0> in <module>()
----> 1 tok_lm, _ = get_texts(df_lm, 1)
2 tok_lm_val, _ = get_texts(df_lm_val, 1)
<ipython-input-13-618696e3ed78> in get_texts(df, n_lbls, caps)
16 texts = texts.apply(fixup).values.astype(str)
17
---> 18 tok = Tokenizer().proc_all_mp(partition_by_cores(texts), caps=False)
19 return tok, list(labels)
20
~/Dropbox/Projects/AskDB/fastai/text.py in proc_all_mp(ss, lang, caps)
78 ncpus = num_cpus()//2
79 with ProcessPoolExecutor(ncpus) as e:
---> 80 return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss), caps), [])
81
82
~/anaconda3/envs/fastai/lib/python3.6/concurrent/futures/process.py in map(self, fn, timeout, chunksize, *iterables)
494 results = super().map(partial(_process_chunk, fn),
495 _get_chunks(*iterables, chunksize=chunksize),
--> 496 timeout=timeout)
497 return _chain_from_iterable_of_lists(results)
498
~/anaconda3/envs/fastai/lib/python3.6/concurrent/futures/_base.py in map(self, fn, timeout, chunksize, *iterables)
573 end_time = timeout + time.time()
574
--> 575 fs = [self.submit(fn, *args) for args in zip(*iterables)]
576
577 # Yield must be hidden in closure so that the futures are submitted
~/anaconda3/envs/fastai/lib/python3.6/concurrent/futures/_base.py in <listcomp>(.0)
573 end_time = timeout + time.time()
574
--> 575 fs = [self.submit(fn, *args) for args in zip(*iterables)]
576
577 # Yield must be hidden in closure so that the futures are submitted
~/anaconda3/envs/fastai/lib/python3.6/concurrent/futures/process.py in _get_chunks(chunksize, *iterables)
135 def _get_chunks(*iterables, chunksize):
136 """ Iterates over zip()ed iterables in chunks. """
--> 137 it = zip(*iterables)
138 while True:
139 chunk = tuple(itertools.islice(it, chunksize))
TypeError: zip argument #3 must support iteration