c++ - Cuda error illegal memory referenced in devicesync and cudamemcopy -
in code create host variable
h4_in = (double*)calloc(2 * countlog, sizeof(double)); h4_out = (double*)calloc(23 * countlog, sizeof(double));
coountlog variable denotes row size of 2d array(which implementing 1d array)
//send data host in stat2 calculations (int count = 0; count < countlog; count++) { h4_in[count * 2 + 0] = prc[count]; h4_in[count * 2 + 1] = h_stat1out[count * 6]; }
here how call cuda in main program
//free cuda memory previous call cudafree(d3_in); cudafree(d3_out); cudastatus = cudadevicereset(); if (cudastatus != cudasuccess) { fprintf(stderr, "cudadevicereset failed :%s\n", cudageterrorstring(cudastatus)); } //send data host in stat2 calculations (int count = 0; count < countlog; count++) { h4_in[count * 2 + 0] = prc[count]; h4_in[count * 2 + 1] = h_stat1out[count * 6]; } //query device parameters cudaoccupancymaxpotentialblocksize(&mingridsize, &threadsperblock, calcstats2, 0, countlog); // round according array size blockspergrid = (countlog + threadsperblock - 1) / threadsperblock; //allocate memory on gpu cudastatus = cudamalloc((void **)&d4_in, 2 * countlog * sizeof(double)); if (cudastatus != cudasuccess) { fprintf(stderr, "cudamalloc failed in kernel calcstats2 d_in :%s\n", cudageterrorstring(cudastatus)); } cudastatus = cudamalloc((void **)&d4_out, 23 * countlog * sizeof(double)); if (cudastatus != cudasuccess) { fprintf(stderr, "cudamalloc failed in kernel calcstats2 d_out :%s\n", cudageterrorstring(cudastatus)); } //transfer array gpu cudastatus = cudamemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudamemcpyhosttodevice); if (cudastatus != cudasuccess) { fprintf(stderr, "cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudageterrorstring(cudastatus)); } //launch threads calcstats2 <<<blockspergrid, threadsperblock>>>(d4_out, d4_in, countlog); cudastatus = cudagetlasterror(); if (cudastatus != cudasuccess) { fprintf(stderr, "calcstats2 kernel failed: %s\n", cudageterrorstring(cudastatus)); } cudastatus = cudadevicesynchronize(); cudastatus = cudagetlasterror(); if (cudastatus != cudasuccess) { fprintf(stderr, "device sync failed: %s\n", cudageterrorstring(cudastatus)); } //transfer data host cudastatus = cudamemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudamemcpydevicetohost); if (cudastatus != cudasuccess) { fprintf(stderr, "cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudageterrorstring(cudastatus)); } //free cuda cudafree(d4_in); cudafree(d4_out);
the kernel call follows
__global__ void calcstats2(double *d4_out, double *d4_in, int size) { int idx = blockdim.x*blockidx.x + threadidx.x; double x, a, b, c, d, bx, ba, bb, bc; if (idx < 4) { d4_out[idx * 23 + 0] = -1; d4_out[idx * 23 + 1] = -1; d4_out[idx * 23 + 2] = -1; d4_out[idx * 23 + 3] = -1; d4_out[idx * 23 + 4] = -1; d4_out[idx * 23 + 5] = -1; d4_out[idx * 23 + 6] = -1; d4_out[idx * 23 + 7] = -1; d4_out[idx * 23 + 8] = -1; d4_out[idx * 23 + 9] = -1; d4_out[idx * 23 + 10] = -1; d4_out[idx * 23 + 11] = -1; d4_out[idx * 23 + 12] = -1; d4_out[idx * 23 + 13] = -1; d4_out[idx * 23 + 14] = -1; d4_out[idx * 23 + 15] = -1; d4_out[idx * 23 + 16] = -1; d4_out[idx * 23 + 17] = -1; d4_out[idx * 23 + 18] = -1; d4_out[idx * 23 + 19] = -1; d4_out[idx * 23 + 20] = -1; d4_out[idx * 23 + 21] = -1; d4_out[idx * 23 + 22] = -1; } else { x = d4_in[idx * 2 - 8]; = d4_in[idx * 2 - 6]; b = d4_in[idx * 2 - 4]; c = d4_in[idx * 2 - 2]; d = d4_in[idx * 2 - 0]; bx = d4_in[idx * 2 - 5]; ba = d4_in[idx * 2 - 3]; bb = d4_in[idx * 2 - 1]; bc = d4_in[idx * 2 + 1]; //start stats calcs here d4_out[idx * 23 + 0] = fabs(x - d) / fabs(a - x); d4_out[idx * 23 + 1] = fabs(a - d) / fabs(a - x); d4_out[idx * 23 + 2] = fabs(b - d) / fabs(c - b); d4_out[idx * 23 + 3] = fabs(c - d) / fabs(c - b); d4_out[idx * 23 + 4] = fabs(b - d) / fabs(a - b); d4_out[idx * 23 + 5] = fabs(a - d) / fabs(a - b); d4_out[idx * 23 + 6] = fabs(x - c) / fabs(a - x); d4_out[idx * 23 + 7] = fabs(a - c) / fabs(a - x); d4_out[idx * 23 + 8] = fabs(c - b) / fabs(a - b); d4_out[idx * 23 + 9] = fabs(a - b) / fabs(a - x); d4_out[idx * 23 + 10] = fabs(c - d) / fabs(a - b); d4_out[idx * 23 + 11] = fabs(c - d) / fabs(a - x); d4_out[idx * 23 + 12] = fabs(c - b) / fabs(a - x); d4_out[idx * 23 + 13] = bc; d4_out[idx * 23 + 14] = bb; d4_out[idx * 23 + 15] = ba; d4_out[idx * 23 + 16] = bx; d4_out[idx * 23 + 17] = bb + bc; d4_out[idx * 23 + 18] = ba + bb + bc; d4_out[idx * 23 + 19] = bx + ba + bb + bc; d4_out[idx * 23 + 20] = ba + bb; d4_out[idx * 23 + 21] = bx + ba + bb; d4_out[idx * 23 + 22] = bx + ba; } }
i getting error in cudamemcppy device host , cudadevicesynchronise illegal memory access encountered. following stack overflow corrected code make 1d array, have allocated same memory both host , device arrays. strange thing
this program runs on smaller files (the input ohlc data) gives error on larger files
even larger file there 3 other kernel calls run without issue.
any appreciated.
thanks in advance
abhishek
ps using single gtx 760 card (asus vendor :https://www.asus.com/graphics-cards/gtx760dc2oc2gd5/) 2gb memory. cuda version 7. ide vs 2013.
you (possibly) launching more threads need:
blockspergrid = (countlog + threadsperblock - 1) / threadsperblock;
and have no thread check condition in kernel. threads numbered higher countlog
access arrays out-of-bounds.
try changing else statement in kernel to:
else if (idx < size)
Comments
Post a Comment