Ran into a new driver issue on AWS (SOLVED)

I am running the following code and getting the error below. Training runs without error, but it doesn’t look like my GPU is being used. It was working fine yesterday… do I need to downgrade PyTorch? My version is 0.3.0.post4.

from fastai.conv_learner import *
from planet import f2

# Root directory of the dataset. The trailing slash matters: the label CSV path
# is built as f'{PATH}prod_train.csv' with no separator added.
PATH = 'data/shopstyle/'

# Architecture passed to tfms_from_model and ConvLearner.pretrained.
f_model = resnet34

def get_data(sz):
    """Create the ImageClassifierData for images of size ``sz``.

    Transforms are built for ``f_model`` with the ``transforms_side_on``
    augmentations and ``max_zoom=1.05``. The data is then read from the
    module-level ``label_csv`` under ``PATH`` (training folder ``'train'``,
    test folder ``'test'``), holding out the rows listed in ``val_idxs``.
    """
    image_tfms = tfms_from_model(
        f_model,
        sz,
        aug_tfms=transforms_side_on,
        max_zoom=1.05,
    )
    return ImageClassifierData.from_csv(
        PATH,
        'train',
        label_csv,
        tfms=image_tfms,
        suffix='.jpg',
        val_idxs=val_idxs,
        test_name='test',
    )

def print_list(list_or_iterator):
    """Format the items of ``list_or_iterator`` as a bracketed string.

    Every item is converted with ``str`` and the results are joined with
    ``", "``; for example ``[1, 2]`` becomes ``"[1, 2]"`` and an empty
    iterable becomes ``"[]"``.
    """
    joined = ", ".join(map(str, list_or_iterator))
    return f"[{joined}]"

# Label file lives directly under PATH (PATH already ends with '/').
label_csv = f'{PATH}prod_train.csv'
# Number of lines in the CSV minus one -- presumably to skip a header row; confirm.
n = len(list(open(label_csv)))-1
# Validation row indices from the fastai helper, given the row count.
val_idxs = get_cv_idxs(n)

# Image size passed to get_data and tfms_from_model.
sz = 64
data = get_data(sz)

# NOTE(review): `metrics` is not defined anywhere in this snippet -- presumably
# it was set earlier in the notebook session (e.g. using the imported f2); confirm.
learn = ConvLearner.pretrained(f_model, data, metrics=metrics)

# tfms_from_model returns a pair, unpacked here as (training, validation)
# transforms; only the validation one is used below.
trn_tfms, val_tfrms = tfms_from_model(f_model, sz)
# NOTE(review): PATH ends with '/', so this path contains a double slash
# ('data/shopstyle//valid/...'); harmless on POSIX but probably unintended.
im = val_tfrms(open_image(f'{PATH}/valid/4500132.jpg'))
# im[None] adds a leading axis (assumes `im` is a NumPy array -- confirm).
# predict_array moves the input to the GPU via .cuda(), which is where the
# driver check in the traceback below fails.
preds = learn.predict_array(im[None])
# Pair each class name with the corresponding entry of preds and print them.
p=list(zip(data.classes, preds))
print("predictions = " + print_list(p))

AssertionError                            Traceback (most recent call last)
<ipython-input-2-75349f9290f0> in <module>()
     29 trn_tfms, val_tfrms = tfms_from_model(f_model, sz)
     30 im = val_tfrms(open_image(f'{PATH}/valid/4500132.jpg'))
---> 31 preds = learn.predict_array(im[None])
     32 p=list(zip(data.classes, preds))
     33 print("predictions = " + print_list(p))

~/fastai/courses/dl1/fastai/learner.py in predict_array(self, arr)
    265     def predict_dl(self, dl): return predict_with_targs(self.model, dl)[0]
--> 266     def predict_array(self, arr): return to_np(self.model(V(T(arr).cuda())))
    268     def TTA(self, n_aug=4, is_test=False):

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/_utils.py in _cuda(self, device, async)
     67         else:
     68             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 69             return new_type(self.size()).copy_(self, async)

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
    356 @staticmethod
    357 def _lazy_new(cls, *args, **kwargs):
--> 358     _lazy_init()
    359     # We need this method only for lazy init, so we can remove it
    360     del _CudaBase.__new__

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in _lazy_init()
    118         raise RuntimeError(
    119             "Cannot re-initialize CUDA in forked subprocess. " + msg)
--> 120     _check_driver()
    121     torch._C._cuda_init()
    122     torch._C._cuda_sparse_init()

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in _check_driver()
     69 Alternatively, go to: http://pytorch.org to install
     70 a PyTorch version that has been compiled with your version
---> 71 of the CUDA driver.""".format(str(torch._C._cuda_getDriverVersion())))

The NVIDIA driver on your system is too old (found version 8000).
Please update your GPU driver by downloading and installing a new
version from the URL: http://www.nvidia.com/Download/index.aspx
Alternatively, go to: http://pytorch.org to install
a PyTorch version that has been compiled with your version
of the CUDA driver.
More info:

# packages in environment at /home/ubuntu/src/anaconda3/envs/fastai: 
cuda90                    1.0                  h6433d27_0    pytorch
pytorch                   0.3.0           py36_cuda9.0.176_cudnn7.0.3hdc18817_4  [cuda90]  pytorch
torchvision               0.2.0            py36h17b6947_1    pytorch

I updated the nvidia driver on the instance and now things are happy…and so much faster. :slight_smile:

@binarypoet - How did you update the NVIDIA drivers on AWS? I don’t want to mess anything up. Could you give a little direction?

@binarypoet I have the same issue. Can you please share details on how you resolved it?

I’m having this issue on SageMaker. Any idea how to fix?

