Setup problems: Running the Lesson 1 Notebook


(Maya) #65

As a follow-up to my setup issues: I think this is the relevant subtopic to post in, so here is the full error stack.

 vgg = Vgg16()
 # Grab a few images at a time for training and validation. 
 # NB: They must be in subdirectories named based on their category

 batches = vgg.get_batches(path+'train', batch_size=batch_size)
 val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
 vgg.finetune(batches)
 vgg.fit(batches, val_batches, nb_epoch=1)

###Output & Error message

Found 40 images belonging to 2 classes.

['nvcc', '-shared', '-O3', '-Xlinker', '-rpath,/usr/local/cuda/lib64', '-arch=sm_61', '-m64', '-Xcompiler', '-fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker', '-rpath,/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray', '-I/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray', '-I/usr/local/cuda/include', '-I/opt/anaconda/lib/python2.7/site-packages/theano/sandbox/cuda', '-I/opt/anaconda/lib/python2.7/site-packages/numpy/core/include', '-I/opt/anaconda/include/python2.7', '-I/opt/anaconda/lib/python2.7/site-packages/theano/gof', '-L/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray', '-L/opt/anaconda/lib', '-o', '/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/tmpbDHDIA/ea4e203b6529466794536f8a1bfa77ae.so', 'mod.cu', '-lcudart', '-lcublas', '-lcuda_ndarray', '-lcudnn', '-lpython2.7']


1 #include <Python.h>
2 #include <iostream>
3 #include "theano_mod_helper.h"
4 #include "cuda_ndarray.cuh"
5 #include <math.h>
6 #include <numpy/arrayobject.h>
7 #include <numpy/arrayscalars.h>
8 #include "cudnn.h"
9 #include "cudnn_helper.h"
10 //////////////////////
11 ////  Support Code
12 //////////////////////
13 
14 void _capsule_destructor(PyObject *o) {
15     void *d = PyCapsule_GetContext(o);
16     void *p = PyCapsule_GetPointer(o, NULL);
17     void (*f)(void *) = (void (*)(void *))d;
18     if (f != NULL) f(p);
19 }
20 
21 
22 static cudnnHandle_t _handle = NULL;
23 
24 
25 static int
26 c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
27 
28   int dim = CudaNdarray_NDIM(var);
29   int *strides = (int *)malloc(dim * sizeof(int));
30   int default_str = 1;
31   int return_value = 0;
32   
33   if (strides != NULL) {
34     for (int i = dim-1; i >= 0; i--)
35     {
36       if (CudaNdarray_HOST_STRIDES(var)[i])
37         strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
38       else
39         strides[i] = default_str;
40       default_str *= CudaNdarray_HOST_DIMS(var)[i];
41     }
42     
43     cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
44                                                    CudaNdarray_HOST_DIMS(var),
45                                                    strides);
46   	 									
47     
48     if (err != CUDNN_STATUS_SUCCESS) {
49       PyErr_Format(PyExc_RuntimeError,
50 		  "Could not set tensorNd descriptor: %s"
51 		  "dim=%d",
52 		  cudnnGetErrorString(err), dim);
53 		  
54 	  return_value = -1;
55     }
56   } else {
57     PyErr_Format(PyExc_MemoryError,
58 		"Could not allocate memory for strides array of size %d.",
59 		dim);
60 		
61     return_value = -1;  
62   }
63     
64   free(strides);
65   return return_value;
66 }
67 
68 
69 static int
70 c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
71   if (!CudaNdarray_is_c_contiguous(var)) {
72     PyErr_SetString(PyExc_ValueError,
73 		    "Only contiguous filters (kernels) are supported.");
74     return -1;
75   }
76   int dim = CudaNdarray_NDIM(var);
77   cudnnStatus_t err = cudnnSetFilterNdDescriptor_v4(desc,
78                                                     CUDNN_DATA_FLOAT,
79                                                     CUDNN_TENSOR_NCHW,
80                                                     dim,
81                                                     CudaNdarray_HOST_DIMS(var));
82   if (err != CUDNN_STATUS_SUCCESS) {
83     PyErr_Format(PyExc_RuntimeError,
84 		 "Could not set filter descriptor: %s."
85 		 " dims= %d",
86 		 cudnnGetErrorString(err), dim);
87     return -1;
88   }
89   return 0;
90 }
91 
92 
93 
94     namespace {
95     struct __struct_compiled_op_ea4e203b6529466794536f8a1bfa77ae {
96         PyObject* __ERROR;
97 
98         PyObject* storage_V3;
99 PyObject* storage_V5;
100 PyObject* storage_V7;
101 PyObject* storage_V9;
102 PyObject* storage_V11;
103 PyObject* storage_V13;
104 PyObject* storage_V1;
105         
106 #define DTYPE_INPUT_0 npy_float32
107 #define TYPENUM_INPUT_0 11
108 #define ITEMSIZE_INPUT_0 4
109 #define DTYPE_INPUT_1 npy_float32
110 #define TYPENUM_INPUT_1 11
111 #define ITEMSIZE_INPUT_1 4
112 #define DTYPE_INPUT_2 npy_float32
113 #define TYPENUM_INPUT_2 11
114 #define ITEMSIZE_INPUT_2 4
115 #define DTYPE_INPUT_4 npy_float32
116 #define TYPENUM_INPUT_4 11
117 #define ITEMSIZE_INPUT_4 4
118 #define DTYPE_INPUT_5 npy_float32
119 #define TYPENUM_INPUT_5 11
120 #define ITEMSIZE_INPUT_5 4
121 #define DTYPE_OUTPUT_0 npy_float32
122 #define TYPENUM_OUTPUT_0 11
123 #define ITEMSIZE_OUTPUT_0 4
124 #define APPLY_SPECIFIC(str) str##_node_ea4e203b6529466794536f8a1bfa77ae_0
125 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
126 #define CHOOSE_ALGO 0
127 #define CHOOSE_ALGO_ONCE 0
128 #define CHOOSE_ALGO_TIME 0
129 #define CONV_INPLACE 1
130 
131 cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
132 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
133 cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
134 
135 /* Keep track, from one execution to another, of the dimension of the data
136 and the algorithms, if any, that were selected according to these dimensions
137 and according to the amount of memory available at that time.
138 
139 Note : Implementation selection for backward convolution only exists starting
140 at V3.
141 */
142 int APPLY_SPECIFIC(previous_input_shape)[5];
143 int APPLY_SPECIFIC(previous_kerns_shape)[5];
144 int APPLY_SPECIFIC(previous_output_shape)[5];
145 bool APPLY_SPECIFIC(previous_algo_set);
146 cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
147 cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
148 cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
149 
150 
151 
152 int
153 APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
154                          CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
155                          float alpha, float beta, CudaNdarray **output) {
156 
157   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
158   if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
159     PyErr_SetString(PyExc_ValueError,
160                     "GpuDnnConv images and kernel must have the same stack size\n");
161     return 1;
162   }
163 
164   int nb_dim = CudaNdarray_NDIM(input);
165 
166 #ifdef CONV_INPLACE
167   Py_XDECREF(*output);
168   *output = om;
169   Py_INCREF(*output);
170 #else
171   if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
172     return 1;
173   if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
174     return 1;
175 #endif
176 
177   if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
178     cudaError_t err2 = cudaMemset((*output)->devdata, 0,
179                                   CudaNdarray_SIZE(*output) * sizeof(real));
180     if (err2 != cudaSuccess) {
181       PyErr_Format(PyExc_RuntimeError,
182                    "GpuDnnConv could not fill the output with zeros: %s",
183                    cudaGetErrorString(err2));
184       return 1;
185     }
186     return 0;
187   }
188 
189   if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
190     return 1;
191   if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
192     return 1;
193   if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
194     return 1;
195 
196   {
197     size_t worksize;
198     void *workspace;
199     cudnnConvolutionFwdAlgo_t chosen_algo;
200 
201 
202     if (CHOOSE_ALGO)
203     {
204 
205       // A new convolution implementation should be selected, based either on
206       // timing or heuristics if in one of the two following cases :
207       // - The implementation should only be chosen during the first execution
208       //   of an apply node and this is the first execution of the apply node.
209       // - The implementation should be chosen as often as necessary and the
210       //   shapes of the inputs differ from the last time an implementation
211       //   was chosen.
212       bool reuse_previous_algo;
213       if (CHOOSE_ALGO_ONCE)
214       {
215         // Only choose a new implementation of none has been chosen before.
216         reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
217       }
218       else
219       {
220         // Reuse the previous implementation if the inputs and the kernels
221         // have the same shapes as they had when the previous implementation
222         // was selected
223         bool same_shapes = true;
224         for (int i = 0; (i < nb_dim) && same_shapes; i++)
225         {
226           same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
227                           APPLY_SPECIFIC(previous_input_shape)[i]);
228           same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
229                           APPLY_SPECIFIC(previous_kerns_shape)[i]);
230         }
231         reuse_previous_algo = same_shapes;
232       }
233 
234       // If the previously choosen implementation can't be reused, select a
235       // new one based on the shapes of the current inputs
236       if (!reuse_previous_algo)
237       {
238 
239         // Obtain a convolution algorithm appropriate for the input and kernel
240         // shapes. Either by choosing one according to heuristics or by making
241         // cuDNN time every implementation and choose the best one.
242         if (CHOOSE_ALGO_TIME)
243         {
244           // Time the different implementations to choose the best one
245           int requestedCount = 1;
246           int count;
247           cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
248           err = cudnnFindConvolutionForwardAlgorithm(_handle,
249                                                      APPLY_SPECIFIC(input),
250                                                      APPLY_SPECIFIC(kerns),
251                                                      desc,
252                                                      APPLY_SPECIFIC(output),
253                                                      requestedCount,
254                                                      &count,
255                                                      &choosen_algo_perf);
256           if (err != CUDNN_STATUS_SUCCESS) {
257             PyErr_Format(PyExc_RuntimeError,
258                          "GpuDnnConv: error selecting convolution algo: %s",
259                          cudnnGetErrorString(err));
260             return 1;
261           }
262 
263           chosen_algo = choosen_algo_perf.algo;
264         }
265         else
266         {
267           // The implementation should be chosen using heuristics based on the
268           // input shapes and the amount of memory available.
269 
270           // Get the amount of available memory
271           size_t free = 0, total = 0;
272           cudaError_t err2 = cudaMemGetInfo(&free, &total);
273           if (err2 != cudaSuccess){
274             cudaGetLastError();
275             fprintf(stderr,
276                     "Error when trying to find the memory information"
277                     " on the GPU: %s\n", cudaGetErrorString(err2));
278             return 1;
279           }
280 
281           // Use heuristics to choose the implementation
282           err = cudnnGetConvolutionForwardAlgorithm(_handle,
283                                                     APPLY_SPECIFIC(input),
284                                                     APPLY_SPECIFIC(kerns),
285                                                     desc,
286                                                     APPLY_SPECIFIC(output),
287                                                     CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
288                                                     free,
289                                                     &chosen_algo);
290 
291           if (err != CUDNN_STATUS_SUCCESS) {
292             PyErr_Format(PyExc_RuntimeError,
293                          "GpuDnnConv: error selecting convolution algo: %s",
294                          cudnnGetErrorString(err));
295             return 1;
296           }
297         }
298 
299         // Store the shapes of the inputs and kernels as well as the chosen
300         // algorithm for future use.
301         APPLY_SPECIFIC(previous_algo) = chosen_algo;
302         APPLY_SPECIFIC(previous_algo_set) = true;
303         for (int i = 0; i < nb_dim; i++)
304         {
305             APPLY_SPECIFIC(previous_input_shape)[i] =
306                                             CudaNdarray_HOST_DIMS(input)[i];
307             APPLY_SPECIFIC(previous_kerns_shape)[i] =
308                                             CudaNdarray_HOST_DIMS(kerns)[i];
309         }
310       }
311       else
312       {
313           // Reuse the previously chosen convolution implementation
314           chosen_algo = APPLY_SPECIFIC(previous_algo);
315       }
316     }
317     else
318     {
319       chosen_algo = CONV_ALGO;
320     }
321 
322     if (0){
323       char * a;
324       switch(chosen_algo){
325       case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
326 	a = "implicit gemm (0)";
327 	break;
328       case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
329 	a = "precomp gemm (1)";
330 	break;
331       case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
332 	a = "gemm (2)";
333 	break;
334       case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
335 	a = "direct (3)";
336 	break;
337       case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
338 	a = "fft (4)";
339 	break;
340       case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
341 	a = "fft tiling (5)";
342 	break;
343 #if CUDNN_VERSION > 5000
344       case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
345 	a = "winograd (6)";
346 	break;
347 #endif
348       }
349       printf("GpuDNNConv: algo %s\n", a);
350     }
351 
352     // The FFT implementation (only in V3 and onward) does not support strides,
353     // 1x1 filters or inputs with a spatial dimension larger than 1024.
354     // The tiled-FFT implementation (only in V4 onward) does not support
355     // strides.
356     // If the chosen implementation is FFT or tiled-FFT, validate that it can
357     // be used on the current data and default on a safe implementation if it
358     // can't.
359     // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
360     // defined only for 2d-filters
361     if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
362          chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
363     {
364 
365       // Extract the properties of the convolution descriptor
366       int nd;
367       int pad[2];
368       int stride[2];
369       int upscale[2];
370       cudnnConvolutionMode_t mode;
371       cudnnDataType_t data_type;
372       err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
373                                             upscale, &mode, &data_type);
374 
375       if (err != CUDNN_STATUS_SUCCESS) {
376         PyErr_Format(PyExc_RuntimeError,
377                      "GpuDnnConv: error getting convolution properties: %s",
378                      cudnnGetErrorString(err));
379         return 1;
380       }
381 
382       // Extract the spatial size of the filters
383       int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
384       int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
385 
386       // Extract the spatial size of the input
387       int input_h = CudaNdarray_HOST_DIMS(input)[2];
388       int input_w = CudaNdarray_HOST_DIMS(input)[3];
389 
390       // Ensure that the selected implementation supports the requested
391       // convolution. Fall back to a safe implementation otherwise.
392       if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
393       {
394         if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
395             input_w > 1024 || (filter_h == 1 && filter_w == 1))
396         {
397           chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
398         }
399       }
400       else
401       {
402         // chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
403         if (stride[0] != 1 || stride[1] != 1)
404         {
405           chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
406         }
407       }
408     }
409 
410     err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
411                                                   APPLY_SPECIFIC(input),
412                                                   APPLY_SPECIFIC(kerns),
413                                                   desc,
414                                                   APPLY_SPECIFIC(output),
415                                                   chosen_algo,
416                                                   &worksize);
417     if (err == CUDNN_STATUS_NOT_SUPPORTED) {
418       // Fallback to none algo if not supported
419       // TODO: Print a warning
420       chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
421 
422       err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
423                                                     APPLY_SPECIFIC(input),
424                                                     APPLY_SPECIFIC(kerns),
425                                                     desc,
426                                                     APPLY_SPECIFIC(output),
427                                                     chosen_algo,
428                                                     &worksize);
429     }
430     if (err != CUDNN_STATUS_SUCCESS) {
431       PyErr_Format(PyExc_RuntimeError,
432                    "GpuDnnConv: error getting worksize: %s",
433                    cudnnGetErrorString(err));
434       return 1;
435     }
436     workspace = get_work_mem(worksize);
437     if (workspace == NULL && worksize != 0)
438       return 1;
439 
440     err = cudnnConvolutionForward(
441       _handle,
442       (void *)&alpha,
443       APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
444       APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
445       desc,
446       chosen_algo,
447       workspace, worksize,
448       (void *)&beta,
449       APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
450   }
451   if (err != CUDNN_STATUS_SUCCESS) {
452     PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
453 		 cudnnGetErrorString(err));
454     return 1;
455   }
456   return 0;
457 }
458 
459 #undef DTYPE_INPUT_0
460 #undef TYPENUM_INPUT_0
461 #undef ITEMSIZE_INPUT_0
462 #undef DTYPE_INPUT_1
463 #undef TYPENUM_INPUT_1
464 #undef ITEMSIZE_INPUT_1
465 #undef DTYPE_INPUT_2
466 #undef TYPENUM_INPUT_2
467 #undef ITEMSIZE_INPUT_2
468 #undef DTYPE_INPUT_4
469 #undef TYPENUM_INPUT_4
470 #undef ITEMSIZE_INPUT_4
471 #undef DTYPE_INPUT_5
472 #undef TYPENUM_INPUT_5
473 #undef ITEMSIZE_INPUT_5
474 #undef DTYPE_OUTPUT_0
475 #undef TYPENUM_OUTPUT_0
476 #undef ITEMSIZE_OUTPUT_0
477 #undef APPLY_SPECIFIC
478 #undef CONV_ALGO
479 #undef CHOOSE_ALGO
480 #undef CHOOSE_ALGO_ONCE
481 #undef CHOOSE_ALGO_TIME
482 #undef CONV_INPLACE
483 
484         __struct_compiled_op_ea4e203b6529466794536f8a1bfa77ae() {
485             // This is only somewhat safe because we:
486             //  1) Are not a virtual class
487             //  2) Do not use any virtual classes in the members
488             //  3) Deal with mostly POD and pointers
489 
490             // If this changes, we would have to revise this, but for
491             // now I am tired of chasing segfaults because
492             // initialization code had an error and some pointer has
493             // a junk value.
494             memset(this, 0, sizeof(*this));
495         }
496         ~__struct_compiled_op_ea4e203b6529466794536f8a1bfa77ae(void) {
497             cleanup();
498         }
499 
500         int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1) {
501             Py_XINCREF(storage_V3);
502 Py_XINCREF(storage_V5);
503 Py_XINCREF(storage_V7);
504 Py_XINCREF(storage_V9);
505 Py_XINCREF(storage_V11);
506 Py_XINCREF(storage_V13);
507 Py_XINCREF(storage_V1);
508             this->storage_V3 = storage_V3;
509 this->storage_V5 = storage_V5;
510 this->storage_V7 = storage_V7;
511 this->storage_V9 = storage_V9;
512 this->storage_V11 = storage_V11;
513 this->storage_V13 = storage_V13;
514 this->storage_V1 = storage_V1;
515             
516 
517 
518 
519 
520 
521 
522 
523 
524 #define DTYPE_INPUT_0 npy_float32
525 #define TYPENUM_INPUT_0 11
526 #define ITEMSIZE_INPUT_0 4
527 #define DTYPE_INPUT_1 npy_float32
528 #define TYPENUM_INPUT_1 11
529 #define ITEMSIZE_INPUT_1 4
530 #define DTYPE_INPUT_2 npy_float32
531 #define TYPENUM_INPUT_2 11
532 #define ITEMSIZE_INPUT_2 4
533 #define DTYPE_INPUT_4 npy_float32
534 #define TYPENUM_INPUT_4 11
535 #define ITEMSIZE_INPUT_4 4
536 #define DTYPE_INPUT_5 npy_float32
537 #define TYPENUM_INPUT_5 11
538 #define ITEMSIZE_INPUT_5 4
539 #define DTYPE_OUTPUT_0 npy_float32
540 #define TYPENUM_OUTPUT_0 11
541 #define ITEMSIZE_OUTPUT_0 4
542 #define APPLY_SPECIFIC(str) str##_node_ea4e203b6529466794536f8a1bfa77ae_0
543 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
544 #define CHOOSE_ALGO 0
545 #define CHOOSE_ALGO_ONCE 0
546 #define CHOOSE_ALGO_TIME 0
547 #define CONV_INPLACE 1
548 #define FAIL { \
549         if (!PyErr_Occurred()) { \
550             PyErr_SetString(PyExc_RuntimeError, \
551                 "Unexpected error in an Op's C code. " \
552                 "No Python exception was set."); \
553             } \
554         return 15; \
555 

/opt/anaconda/lib/python2.7/site-packages/theano/gof/cmodule.pyc in module_from_key(self, key, lnk, keep_lock)
   1157             try:
   1158                 location = dlimport_workdir(self.dirname)
-> 1159                 module = lnk.compile_cmodule(location)
   1160                 name = module.__file__
   1161                 assert name.startswith(location)

/opt/anaconda/lib/python2.7/site-packages/theano/gof/cc.pyc in compile_cmodule(self, location)
   1487                 lib_dirs=self.lib_dirs(),
   1488                 libs=libs,
-> 1489                 preargs=preargs)
   1490         except Exception as e:
   1491             e.args += (str(self.fgraph),)

/opt/anaconda/lib/python2.7/site-packages/theano/sandbox/cuda/nvcc_compiler.pyc in compile_str(module_name, src_code, location, include_dirs, lib_dirs, libs, preargs, rpaths, py_module, hide_symbols)
    403             print(cmd)
    404             raise Exception('nvcc return status', p.returncode,
--> 405                             'for cmd', ' '.join(cmd))
    406         elif config.cmodule.compilation_warning and nvcc_stdout:
    407             print(nvcc_stdout)

Exception: ('The following error happened while compiling the node', GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv', precision='float32'}.0, Constant{1.0}, Constant{0.0}), '\n', 'nvcc return status', 2, 'for cmd', 'nvcc -shared -O3 -Xlinker -rpath,/usr/local/cuda/lib64 -arch=sm_61 -m64 -Xcompiler -fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray -I/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray -I/usr/local/cuda/include -I/opt/anaconda/lib/python2.7/site-packages/theano/sandbox/cuda -I/opt/anaconda/lib/python2.7/site-packages/numpy/core/include -I/opt/anaconda/include/python2.7 -I/opt/anaconda/lib/python2.7/site-packages/theano/gof -L/home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/cuda_ndarray -L/opt/anaconda/lib -o /home/ra/.theano/compiledir_Linux-4.8--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.13-64/tmpbDHDIA/ea4e203b6529466794536f8a1bfa77ae.so mod.cu -lcudart -lcublas -lcuda_ndarray -lcudnn -lpython2.7', "[GpuDnnConv{algo='small', inplace=True}(<CudaNdarrayType(float32, 4D)>, <CudaNdarrayType(float32, 4D)>, <CudaNdarrayType(float32, 4D)>, <CDataType{cudnnConvolutionDescriptor_t}>, Constant{1.0}, Constant{0.0})]")

End of Error/Exception

This is with python 2.7, keras 1.1.2

Don’t know how to fix this. Sorry for the repeated posts. Need help, @jeremy.


(Maya) #66

OK, it was something to do with the cuDNN version. I re-installed it (downgraded to 5.1) and things seem to be running, at least. I am happy it is finally running. I have been wanting to get to this point for many days now (ending up with errors and then troubleshooting, which adds to the barrier a lot).


(Phil) #67

thanks for this fix - works perfectly


(Yuzhou Liu) #68

@rachel

I am trying to get the setup to work on Windows with Python 3. (It seems TensorFlow on Windows only works with Python 3.)

I have created a conda environment and verified it from the Python interactive prompt: importing tensorflow and printing a hello-world message with a constant tensor worked fine.

However, when I try to execute this line in lesson 1’s notebook, I get the following error. Any idea why?

[I 21:41:47.212 NotebookApp] Adapting to protocol v5.1 for kernel 6760032e-a017-417c-84c3-33c5f9737ba8
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:126] Couldn’t open CUDA library cublas64_80.dll
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_blas.cc:2294] Unable to load cuBLAS DSO.
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:126] Couldn’t open CUDA library cudnn64_5.dll
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:3517] Unable to load cuDNN DSO
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:126] Couldn’t open CUDA library cufft64_80.dll
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_fft.cc:344] Unable to load cuFFT DSO.
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] successfully opened CUDA library nvcuda.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:126] Couldn’t open CUDA library curand64_80.dll
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_rng.cc:338] Unable to load cuRAND DSO.
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “BestSplits” device_type: “CPU”’) for unknown op: BestSplits
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “CountExtremelyRandomStats” device_type: “CPU”’) for unknown op: CountExtremelyRandomStats
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “FinishedNodes” device_type: “CPU”’) for unknown op: FinishedNodes
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “GrowTree” device_type: “CPU”’) for unknown op: GrowTree
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “ReinterpretStringToFloat” device_type: “CPU”’) for unknown op: ReinterpretStringToFloat
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “SampleInputs” device_type: “CPU”’) for unknown op: SampleInputs
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “ScatterAddNdim” device_type: “CPU”’) for unknown op: ScatterAddNdim
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “TopNInsert” device_type: “CPU”’) for unknown op: TopNInsert
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “TopNRemove” device_type: “CPU”’) for unknown op: TopNRemove
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “TreePredictions” device_type: “CPU”’) for unknown op: TreePredictions
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel (‘op: “UpdateFertileSlots” device_type: “CPU”’) for unknown op: UpdateFertileSlots
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:885] Found device 0 with properties:
name: GeForce GTX 965M
major: 5 minor: 2 memoryClockRate (GHz) 1.15
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.64GiB
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:906] DMA: 0
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:916] 0: Y
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 965M, pci bus id: 0000:01:00.0)
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:222] Check failed: s.ok() could not find cudnnCreate in cudnn DSO; dlerror: cudnnCreate not found


(taewoo) #69

Thanks @chandanpanda. Worked well.

BTW, in case anyone is attempting to do this on CPU…

I’m running this on VMware Player 16 w/ Ubuntu 16.04 and all the latest modules (as of 10/7/2017), including Keras 2.0 and Theano 0.9. The CPU is an Intel i7-4700MQ, 2.4 GHz, 4 cores, 8 logical cores.

1 epoch is taking 13,977s … about 4 hours.

Now you know why nvidia stock 3x’d in past few years.


(Ajit) #70

Hello All,

I built a new desktop with an NVIDIA 1080 Ti graphics processor. I followed the instructions provided on the fastai forum to set up fastai to use the local GPU. However, I noticed that upon running the Lesson 1 code, only my CPU was being used. What additional steps may be required to engage my GPU? Suggestions or pointers to other discussions would help.

Thanks,

  • Ajit