It seems the DataLoader worker processes die unless the code that starts them is wrapped in an if __name__ == '__main__': guard. The error below suggests I should be using fork, but that isn't available on Windows as far as I am aware. I'm wondering whether the 'IS_WINDOWS' part of https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/worker.py isn't getting used - or has a bug. I'll ask around at work to see if anyone knows a way around this.
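For reference, a quick sanity check (just a sketch, assuming a stock CPython install) that shows which start method is actually in play:

import multiprocessing as mp

# Windows only supports 'spawn'; 'fork' is the default on Linux,
# which is why the __main__ guard isn't needed there.
print(mp.get_start_method())        # expect 'spawn' on Windows
print(mp.get_all_start_methods())   # expect ['spawn'] on Windows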
I could repro it, see the more complete error, and fix it in a standalone .py program, but I'm not sure how to get this running in Jupyter.
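In case it clarifies the setup, here's a rough sketch of how I'd imagine splitting things so the spawned workers import a real module instead of the notebook itself (the module name my_data.py is invented, and I haven't actually got this working in Jupyter):

# my_data.py - hypothetical module holding everything the worker processes need
import torch

class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]

def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)

# then in the notebook:
# from my_data import Dataset, collate
# train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds),
#                       collate_fn=collate, num_workers=4)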
My program that runs OK is based on notebook 3 - I could repro the problem in the PyTorch DataLoader section whenever I used num_workers > 0:
from nb_03_bs import *
import torch.nn.functional as F
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import numpy as np

class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]

def fit():
    for epoch in range(epochs):
        for xb,yb in train_dl:
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

def get_model():
    model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
    return model, optim.SGD(model.parameters(), lr=lr)

def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)

bs=64
mpl.rcParams['image.cmap'] = 'gray'
x_train,y_train,x_valid,y_valid = get_data()
n,m = x_train.shape
c = y_train.max()+1
nh = 50
lr = 0.5
epochs = 1
loss_func = F.cross_entropy

train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)

if __name__ == '__main__':
    train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds), collate_fn=collate, num_workers=4)
    valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate, num_workers=4)
    xb,yb = next(iter(train_dl))
    model,opt = get_model()
    fit()
    loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
    assert acc>0.7
    print(loss)
    print(acc)
If I exclude the if __name__ == '__main__': guard, the error I get is:
(fastai-partdeux) D:\OneDrive\AI\fastai_docs\dev_course\dl2\exp>cd d:\OneDrive\AI\fastai_docs\dev_course\dl2\exp && cmd /C "set "PYTHONIOENCODING=UTF-8" && set "PYTHONUNBUFFERED=1" && C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\python.exe c:\Users\bsmi0\.vscode\extensions\ms-python.python-2019.3.6558\pythonFiles\ptvsd_launcher.py --default --client --host localhost --port 50869 d:\OneDrive\AI\fastai_docs\dev_course\dl2\exp\standalone.py "
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 114, in _main
    prepare(preparation_data)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 225, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
    run_name="__mp_main__")
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "d:\OneDrive\AI\fastai_docs\dev_course\dl2\exp\standalone.py", line 46, in <module>
    xb,yb = next(iter(train_dl))
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 162, in __iter__
    return _DataLoaderIter(self)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 438, in __init__
    w.start()
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\process.py", line 112, in start
    self._popen = self._Popen(self)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\context.py", line 223, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\context.py", line 322, in _Popen
    return Popen(process_obj)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\popen_spawn_win32.py", line 46, in __init__
    prep_data = spawn.get_preparation_data(process_obj._name)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
    _check_not_importing_main()
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\spawn.py", line 136, in _check_not_importing_main
    is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
Traceback (most recent call last):
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 480, in _try_get_batch
    data = self.data_queue.get(timeout=timeout)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\multiprocessing\queues.py", line 105, in get
raise Empty
_queue.Empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "c:\Users\bsmi0\.vscode\extensions\ms-python.python-2019.3.6558\pythonFiles\ptvsd_launcher.py", line 45, in <module>
    main(ptvsdArgs)
  File "c:\Users\bsmi0\.vscode\extensions\ms-python.python-2019.3.6558\pythonFiles\lib\python\ptvsd\__main__.py", line 391, in main
    run()
  File "c:\Users\bsmi0\.vscode\extensions\ms-python.python-2019.3.6558\pythonFiles\lib\python\ptvsd\__main__.py", line 272, in run_file
    runpy.run_path(target, run_name='__main__')
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "d:\OneDrive\AI\fastai_docs\dev_course\dl2\exp\standalone.py", line 46, in <module>
    xb,yb = next(iter(train_dl))
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 545, in __next__
    idx, batch = self._get_batch()
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 522, in _get_batch
    success, data = self._try_get_batch()
  File "C:\Users\bsmi0\Anaconda3\envs\fastai-partdeux\lib\site-packages\torch\utils\data\dataloader.py", line 488, in _try_get_batch
    raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))
RuntimeError: DataLoader worker (pid(s) 30832) exited unexpectedly