c++ - Cuda error illegal memory referenced in devicesync and cudamemcopy -

in code create host variable

h4_in = (double*)calloc(2 * countlog, sizeof(double)); h4_out = (double*)calloc(23 * countlog, sizeof(double));

coountlog variable denotes row size of 2d array(which implementing 1d array)

//send data host in stat2 calculations (int count = 0; count < countlog; count++) {     h4_in[count * 2 + 0] = prc[count];     h4_in[count * 2 + 1] = h_stat1out[count * 6]; }

here how call cuda in main program

//free cuda memory previous call cudafree(d3_in); cudafree(d3_out); cudastatus = cudadevicereset(); if (cudastatus != cudasuccess) {     fprintf(stderr, "cudadevicereset failed :%s\n", cudageterrorstring(cudastatus)); } //send data host in stat2 calculations (int count = 0; count < countlog; count++) {     h4_in[count * 2 + 0] = prc[count];     h4_in[count * 2 + 1] = h_stat1out[count * 6]; } //query device parameters cudaoccupancymaxpotentialblocksize(&mingridsize, &threadsperblock, calcstats2, 0, countlog); // round according array size  blockspergrid = (countlog + threadsperblock - 1) / threadsperblock; //allocate memory on gpu cudastatus = cudamalloc((void **)&d4_in, 2 * countlog * sizeof(double)); if (cudastatus != cudasuccess) {     fprintf(stderr, "cudamalloc failed in kernel calcstats2 d_in :%s\n", cudageterrorstring(cudastatus)); } cudastatus = cudamalloc((void **)&d4_out, 23 * countlog * sizeof(double)); if (cudastatus != cudasuccess) {     fprintf(stderr, "cudamalloc failed in kernel calcstats2 d_out :%s\n", cudageterrorstring(cudastatus)); } //transfer array gpu cudastatus = cudamemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudamemcpyhosttodevice); if (cudastatus != cudasuccess) {     fprintf(stderr, "cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudageterrorstring(cudastatus)); } //launch threads calcstats2 <<<blockspergrid, threadsperblock>>>(d4_out, d4_in, countlog); cudastatus = cudagetlasterror(); if (cudastatus != cudasuccess) {     fprintf(stderr, "calcstats2 kernel failed: %s\n", cudageterrorstring(cudastatus)); } cudastatus = cudadevicesynchronize(); cudastatus = cudagetlasterror(); if (cudastatus != cudasuccess) {     fprintf(stderr, "device sync failed: %s\n", cudageterrorstring(cudastatus)); } //transfer data host cudastatus = cudamemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudamemcpydevicetohost); if (cudastatus != cudasuccess) {     fprintf(stderr, "cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudageterrorstring(cudastatus)); } //free cuda cudafree(d4_in); cudafree(d4_out);

the kernel call follows

__global__ void calcstats2(double *d4_out, double *d4_in, int size) {     int idx = blockdim.x*blockidx.x + threadidx.x;     double x, a, b, c, d, bx, ba, bb, bc;     if (idx < 4)     {         d4_out[idx * 23 + 0] = -1;         d4_out[idx * 23 + 1] = -1;         d4_out[idx * 23 + 2] = -1;         d4_out[idx * 23 + 3] = -1;         d4_out[idx * 23 + 4] = -1;         d4_out[idx * 23 + 5] = -1;         d4_out[idx * 23 + 6] = -1;         d4_out[idx * 23 + 7] = -1;         d4_out[idx * 23 + 8] = -1;         d4_out[idx * 23 + 9] = -1;         d4_out[idx * 23 + 10] = -1;         d4_out[idx * 23 + 11] = -1;         d4_out[idx * 23 + 12] = -1;         d4_out[idx * 23 + 13] = -1;         d4_out[idx * 23 + 14] = -1;         d4_out[idx * 23 + 15] = -1;         d4_out[idx * 23 + 16] = -1;         d4_out[idx * 23 + 17] = -1;         d4_out[idx * 23 + 18] = -1;         d4_out[idx * 23 + 19] = -1;         d4_out[idx * 23 + 20] = -1;         d4_out[idx * 23 + 21] = -1;         d4_out[idx * 23 + 22] = -1;     }     else     {         x = d4_in[idx * 2 - 8];         = d4_in[idx * 2 - 6];         b = d4_in[idx * 2 - 4];         c = d4_in[idx * 2 - 2];         d = d4_in[idx * 2 - 0];         bx = d4_in[idx * 2 - 5];         ba = d4_in[idx * 2 - 3];         bb = d4_in[idx * 2 - 1];         bc = d4_in[idx * 2 + 1];         //start stats calcs here         d4_out[idx * 23 + 0] = fabs(x - d) / fabs(a - x);         d4_out[idx * 23 + 1] = fabs(a - d) / fabs(a - x);         d4_out[idx * 23 + 2] = fabs(b - d) / fabs(c - b);         d4_out[idx * 23 + 3] = fabs(c - d) / fabs(c - b);         d4_out[idx * 23 + 4] = fabs(b - d) / fabs(a - b);         d4_out[idx * 23 + 5] = fabs(a - d) / fabs(a - b);         d4_out[idx * 23 + 6] = fabs(x - c) / fabs(a - x);         d4_out[idx * 23 + 7] = fabs(a - c) / fabs(a - x);         d4_out[idx * 23 + 8] = fabs(c - b) / fabs(a - b);         d4_out[idx * 23 + 9] = fabs(a - b) / fabs(a - x);         d4_out[idx * 23 + 10] = fabs(c - d) / fabs(a - b);         d4_out[idx * 23 + 11] = fabs(c - d) / fabs(a - x);         d4_out[idx * 23 + 12] = fabs(c - b) / fabs(a - x);         d4_out[idx * 23 + 13] = bc;         d4_out[idx * 23 + 14] = bb;         d4_out[idx * 23 + 15] = ba;         d4_out[idx * 23 + 16] = bx;         d4_out[idx * 23 + 17] = bb + bc;         d4_out[idx * 23 + 18] = ba + bb + bc;         d4_out[idx * 23 + 19] = bx + ba + bb + bc;         d4_out[idx * 23 + 20] = ba + bb;         d4_out[idx * 23 + 21] = bx + ba + bb;         d4_out[idx * 23 + 22] = bx + ba;     } }

i getting error in cudamemcppy device host , cudadevicesynchronise illegal memory access encountered. following stack overflow corrected code make 1d array, have allocated same memory both host , device arrays. strange thing

this program runs on smaller files (the input ohlc data) gives error on larger files
even larger file there 3 other kernel calls run without issue.

any appreciated.

thanks in advance

abhishek

ps using single gtx 760 card (asus vendor :https://www.asus.com/graphics-cards/gtx760dc2oc2gd5/) 2gb memory. cuda version 7. ide vs 2013.

you (possibly) launching more threads need:

 blockspergrid = (countlog + threadsperblock - 1) / threadsperblock;

and have no thread check condition in kernel. threads numbered higher countlog access arrays out-of-bounds.

try changing else statement in kernel to:

else if (idx < size)

Search This Blog

Panthy J

c++ - Cuda error illegal memory referenced in devicesync and cudamemcopy -

Comments

Post a Comment

Popular posts from this blog

asp.net - 'System.Web.HttpContext' does not contain a definition for 'GetOwinContext' Mystery -

yii2 - Yii 2 Running a Cron in the basic template -

mercurial graft feature, can it copy? -