Hi,
I am trying to set up the labs on my local Ubuntu 16.04 LTS desktop.
The notebook for lesson 1 worked when Theano used ‘CPU’ mode.
When I changed Theano to ‘GPU’, I received the error below when I ran this cell.
code:
import utils; reload(utils)
from utils import plots
error:
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end(gpuarray)
Using gpu device 0: GeForce GT 520MX (CNMeM is enabled with initial size: 70.0% of memory, cuDNN Device not supported)
1 #include <Python.h>
2 #include <iostream>
3 #include "theano_mod_helper.h"
4 #include "cuda_ndarray.cuh"
5 //////////////////////
6 //// Support Code
7 //////////////////////
8
9 static __global__ void kernel_reduce_ccontig_node_m23eb44cb0bd6afa880a56f8b1f6a7d7f_0(
10 const unsigned int d0,
11 const float *A,
12 float * Z)
13 {
14 const int threadCount = blockDim.x;
15 const int threadNum = threadIdx.x;
16 extern __shared__ float buf[];
17 float myresult = 0;
18
19 if (warpSize != 32)
20 {
21 return; //TODO: set error code
22 }
23
24 for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
25 {
26 myresult = myresult + A[i0];
27 }
28
29 __syncthreads(); // some kernel do multiple reduction.
30 buf[threadNum] = myresult;
31 __syncthreads();
32
33 // rest of function is handled by one warp
34 if (threadNum < warpSize)
35 {
36 //round up all the partial sums into the first `warpSize` elements
37 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
38 {
39 myresult = myresult + buf[i];
40 }
41 buf[threadNum] = myresult;
42 /*Comment this optimization as it don't work on Fermi GPU.
43 TODO: find why it don't work or put the GPU compute capability into the version
44 // no sync because only one warp is running
45 if(threadCount >32)
46 {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
47 if (threadNum == 0)
48 {
49 Z[0] = buf[0];
50 }
51
52 }
53 else */
54 if (threadNum < 16)
55 {
56 //reduce so that threadNum 0 has the reduction of everything
57 if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
58 if (threadNum == 0)
59 {
60 Z[0] = buf[0];
61 }
62 }
63 }
64
65 }
66
67
68 static __global__ void kernel_reduce_1_node_m23eb44cb0bd6afa880a56f8b1f6a7d7f_0(
69 const unsigned int d0,
70 const float *A, const int sA0,
71 float * Z)
72 {
73 const int threadCount = blockDim.x;
74 const int threadNum = threadIdx.x;
75 extern __shared__ float buf[];
76 float myresult = 0;
77
78 if (warpSize != 32)
79 {
80 return; //TODO: set error code
81 }
82
83 for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
84 {
85 myresult = myresult + A[i0 * sA0];
86 }
87
88 __syncthreads(); // some kernel do multiple reduction.
89 buf[threadNum] = myresult;
90 __syncthreads();
91
92 // rest of function is handled by one warp
93 if (threadNum < warpSize)
94 {
95 //round up all the partial sums into the first `warpSize` elements
96 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
97 {
98 myresult = myresult + buf[i];
99 }
100 buf[threadNum] = myresult;
101 /*Comment this optimization as it don't work on Fermi GPU.
102 TODO: find why it don't work or put the GPU compute capability into the version
103 // no sync because only one warp is running
104 if(threadCount >32)
105 {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
106 if (threadNum == 0)
107 {
108 Z[0] = buf[0];
109 }
110
111 }
112 else */
113 if (threadNum < 16)
114 {
115 //reduce so that threadNum 0 has the reduction of everything
116 if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
117 if (threadNum == 0)
118 {
119 Z[0] = buf[0];
.
.
.
.
.
.
.
593 //////////////////////
594 //// Functions
595
['nvcc', '-shared', '-O3', '-arch=sm_21', '-m64', '-Xcompiler', '-fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker',