Please help: local Ubuntu setup problem: cuDNN "Device not supported"

Hi,
I am trying to set up the labs on my local Ubuntu 16.04 LTS desktop.
The notebook for Lesson 1 worked when Theano was in ‘CPU’ mode.
When I changed Theano to ‘GPU’ mode, I received the error below when I ran this cell.
code:
import utils; reload(utils)
from utils import plots

error:
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end(gpuarray)

    Using gpu device 0: GeForce GT 520MX (CNMeM is enabled with initial size: 70.0% of memory, cuDNN Device not supported)
    1 #include <Python.h>
    2 #include <iostream>
    3 #include "theano_mod_helper.h"
    4 #include "cuda_ndarray.cuh"
    5 //////////////////////
    6 ////  Support Code
    7 //////////////////////
    8 
    9             static __global__ void kernel_reduce_ccontig_node_m23eb44cb0bd6afa880a56f8b1f6a7d7f_0(
    10                     const unsigned int d0,
    11                     const float *A,
    12                     float * Z)
    13             {
    14                 const int threadCount = blockDim.x;
    15                 const int threadNum = threadIdx.x;
    16                 extern __shared__ float buf[];
    17                 float myresult = 0;
    18 
    19                 if (warpSize != 32)
    20                 {
    21                     return;  //TODO: set error code
    22                 }
    23 
    24                 for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
    25                 {
    26                     myresult = myresult + A[i0];
    27                 }
    28                 
    29         __syncthreads(); // some kernel do multiple reduction.
    30         buf[threadNum] = myresult;
    31         __syncthreads();
    32 
    33         // rest of function is handled by one warp
    34         if (threadNum < warpSize)
    35         {
    36             //round up all the partial sums into the first `warpSize` elements
    37             for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
    38             {
    39                 myresult = myresult + buf[i];
    40             }
    41             buf[threadNum] = myresult;
    42         /*Comment this optimization as it don't work on Fermi GPU.
    43         TODO: find why it don't work or put the GPU compute capability into the version
    44             // no sync because only one warp is running
    45             if(threadCount >32)
    46             {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
    47                 if (threadNum == 0)
    48                 {
    49                     Z[0] = buf[0];
    50                 }
    51 
    52             }
    53             else */
    54             if (threadNum < 16)
    55             {
    56                 //reduce so that threadNum 0 has the reduction of everything
    57                 if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
    58                 if (threadNum == 0)
    59                 {
    60                     Z[0] = buf[0];
    61                 }
    62             }
    63         }
    64         
    65             }
    66             
    67 
    68             static __global__ void kernel_reduce_1_node_m23eb44cb0bd6afa880a56f8b1f6a7d7f_0(
    69                     const unsigned int d0,
    70                     const float *A, const int sA0,
    71                     float * Z)
    72             {
    73                 const int threadCount = blockDim.x;
    74                 const int threadNum = threadIdx.x;
    75                 extern __shared__ float buf[];
    76                 float myresult = 0;
    77 
    78                 if (warpSize != 32)
    79                 {
    80                     return;  //TODO: set error code
    81                 }
    82 
    83                 for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
    84                 {
    85                     myresult = myresult + A[i0 * sA0];
    86                 }
    87                 
    88         __syncthreads(); // some kernel do multiple reduction.
    89         buf[threadNum] = myresult;
    90         __syncthreads();
    91 
    92         // rest of function is handled by one warp
    93         if (threadNum < warpSize)
    94         {
    95             //round up all the partial sums into the first `warpSize` elements
    96             for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
    97             {
    98                 myresult = myresult + buf[i];
    99             }
    100             buf[threadNum] = myresult;
    101         /*Comment this optimization as it don't work on Fermi GPU.
    102         TODO: find why it don't work or put the GPU compute capability into the version
    103             // no sync because only one warp is running
    104             if(threadCount >32)
    105             {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
    106                 if (threadNum == 0)
    107                 {
    108                     Z[0] = buf[0];
    109                 }
    110 
    111             }
    112             else */
    113             if (threadNum < 16)
    114             {
    115                 //reduce so that threadNum 0 has the reduction of everything
    116                 if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
    117                 if (threadNum == 0)
    118                 {
    119                     Z[0] = buf[0];
    .
.
.
.
.
.
.    
    593 //////////////////////
    594 ////  Functions
    595


    ['nvcc', '-shared', '-O3', '-arch=sm_21', '-m64', '-Xcompiler', '-fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker',

I found that the NVIDIA GeForce GT 520MX card has CUDA compute capability 2.1 (hence the `-arch=sm_21` flag above),
but cuDNN requires compute capability 3.0 or higher — which explains the "cuDNN Device not supported" message.

1 Like