cuda - cublasDtrsm after LU with pivoting -


i stuck @ small problem. i've got solve linear system a * x = b.

the matrix a gets decomposed lu-factorization (lapack). result factorized matrix , pivotarray. after want solve 2 linear systems: u * x = y , l * y = b on gpu *cublasdtrsm*. because of row interchanges dgetrf in lapack have pass pivot array cublas. *cublasdtrsm*-function don't offers this. without pivot array wrong results.

i searched disabling pivoting in lapack, regarding stability it's not possible. there hint how solve linear equation system lu-factorization?

if wanted use particular approach (cublas trsm after lapack getrf), believe should able use cublas trsm l,u output of lapack rearranging b vector (or matrix) match rearrangement order lapack performed during pivoting. believe order given in formula ipiv in lapack documentation:

ipiv
ipiv integer array, dimension (min(m,n)) pivot indices; 1 <= <= min(m,n), row of matrix interchanged row ipiv(i).

here's sample code demonstrates idea simple 3x3 test case single rhs vector:

$ cat t853.cu #include <cstdio> #include <cstdlib> #include <cuda_runtime.h> #include <cublas_v2.h>  #define cudacall(call)                                                                                                          \                                                                                                                              \     {                                                                                                                           \         cudaerror_t err = (call);                                                                                               \         if(cudasuccess != err)                                                                                                  \         {                                                                                                                       \             fprintf(stderr,"cuda error:\nfile = %s\nline = %d\nreason = %s\n", __file__, __line__, cudageterrorstring(err));    \             cudadevicereset();                                                                                                  \             exit(exit_failure);                                                                                                 \         }                                                                                                                       \     }                                                                                                                           \     while (0)  #define cublascall(call)                                                                                        \                                                                                                              \     {                                                                                                           \         cublasstatus_t status = (call);                                                                         \         if(cublas_status_success != status)                                                                     \         {                                                                                                       \             fprintf(stderr,"cublas error:\nfile = %s\nline = %d\ncode = %d\n", __file__, __line__, status);     \             cudadevicereset();                                                                                  \             exit(exit_failure);                                                                                 \         }                                                                                                       \                                                                                                                 \     }                                                                                                           \     while(0)   void lu_device(float *src_d, int n, int *pivot) {     cublashandle_t handle;     cublascall(cublascreate_v2(&handle));      int batchsize = 1;      int *p, *info;      cudacall(cudamalloc<int>(&p,n * batchsize * sizeof(int)));     cudacall(cudamalloc<int>(&info,batchsize * sizeof(int)));      int lda = n;      float *a[] = { src_d };     float **a_d;     cudacall(cudamalloc<float*>(&a_d,sizeof(a)));     cudacall(cudamemcpy(a_d,a,sizeof(a),cudamemcpyhosttodevice));      cublascall(cublassgetrfbatched(handle,n,a_d,lda,p,info,batchsize));      int infoh = 0;     cudacall(cudamemcpy(&infoh,info,sizeof(int),cudamemcpydevicetohost));     cudacall(cudamemcpy(pivot,p,n*batchsize*sizeof(int),cudamemcpydevicetohost)); #ifdef debug_print     (int qq = 0; qq < n*batchsize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); } #endif      if(infoh == n)     {         fprintf(stderr, "factorization failed: matrix singular\n");         cudadevicereset();         exit(exit_failure);     }     cudafree(p); cudafree(info); cudafree(a_d); cublasdestroy(handle); }  void lu(float* src, float* l, float *u, int n, int *pivot) {     float *src_d;      cudacall(cudamalloc<float>(&src_d, n*n * sizeof(float)));     cudacall(cudamemcpy(src_d,src,n*n * sizeof(float),cudamemcpyhosttodevice));      lu_device(src_d,n,pivot);      cudacall(cudamemcpy(l,src_d,n * n * sizeof(float),cudamemcpydevicetohost));     cudacall(cudamemcpy(u,src_d,n * n * sizeof(float),cudamemcpydevicetohost));     (int = 0; < n; ++){       (int j = 0; j < i; j++)   l[i*n+j] = 0.0;       (int j = i+1; j < n; j++) u[i*n+j] = 0.0;}      cudafree(src_d); }  void rearrange(float *vec, int *pivot, int n, int dir){ #define dir_forward 0 #define dir_reverse 1 #define swap(x,y) {float swaptmp=(*(y)); (*(y))=(*(x)); (*(x))=swaptmp;}   if (dir == dir_forward)     (int = 0; < n; i++)    swap((vec+i),(vec+pivot[i]-1))   else     (int = n-1; >= 0; i--) swap((vec+i),(vec+pivot[i]-1)) }   void trsm(float *a, float *x, float *b, int n, cublasfillmode_t uplo, cublasdiagtype_t diagt ){      cublashandle_t handle;     cublascall(cublascreate_v2(&handle));     float *a_d, *b_d;     cudacall(cudamalloc<float>(&a_d, n*n * sizeof(float)));     cudacall(cudamalloc<float>(&b_d, n * sizeof(float)));     cudacall(cudamemcpy(b_d, b,   n*sizeof(float), cudamemcpyhosttodevice));     cudacall(cudamemcpy(a_d, a, n*n*sizeof(float), cudamemcpyhosttodevice));     const float alpha = 1.0f;     cublascall(cublasstrsm(handle, cublas_side_left, uplo, cublas_op_n, diagt, n, 1, &alpha, a_d, n, b_d, n));     cudacall(cudamemcpy(x, b_d, n*sizeof(float), cudamemcpydevicetohost));     cudafree(a_d); cudafree(b_d); cublasdestroy(handle); }  void test_solve() {  // solve ax=b  // 1. perform lu on  // 2. using pivot sequence, rearrange b -> b'  // 3. perform trsm on ly=b'  // 4. perform trsm on ux=y  // = |0 1  4 |  //     |3 3  9 |  //     |4 10 16|  // x = |1|  //     |2|  //     |3|  // b = |14|  //     |36|  //     |72|      const int n = 3;  // has 3,2,3 pivot order     float          a_col_major[n*n] = { 0, 3, 4,                                         1, 3, 10,                                         4, 9, 16 };     float b1[n] = {14, 36, 72}; /* example - has 3,3,3 pivot order     float          a_transpose[n*n] = { 0, 1,  4,                                         3, 3,  9,                                         4, 10, 16 };     float b2[n] = {18, 37, 70}; */     float result_x[n];     int pivot[n];     float l[n*n];     float u[n*n];     float y[n];      //select matrix setting "a"     float *a = a_col_major;     float *b = b1;      printf("input:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             printf("%f\t",a[i*n+j]);         printf("\n");     }      printf("\n\n"); // 1. lu on     lu(a,l,u,n,pivot); #ifdef debug_print     printf("l:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             printf("%f\t",l[i*n+j]);         printf("\n");     }      printf("\n\n");     printf("u:\n\n");     for(int i=0; i<n; i++)     {         for(int j=0; j<n; j++)             printf("%f\t",u[i*n+j]);         printf("\n");     }      printf("\n\n");  #endif // 2. rearrange b     rearrange(b,pivot,n,dir_forward); #ifdef debug_print    (int = 0; < n; i++) printf("b'[%d] = %f\n", i, b[i]); #endif // 3. trsm on ly=b     trsm(l, y, b, n, cublas_fill_mode_lower, cublas_diag_unit); // 4. trsm on ux=y     trsm(u, result_x, y, n, cublas_fill_mode_upper, cublas_diag_non_unit);      fprintf(stdout, "solution:\n\n");     for(int i=0; i<n; i++)     {             printf("%f\n",result_x[i]);     }  }  int main() {     test_solve();      return 0; }  $ nvcc -o t853 t853.cu -lcublas $ ./t853 input:  0.000000        3.000000        4.000000 1.000000        3.000000        10.000000 4.000000        9.000000        16.000000   solution:  1.000000 2.000000 3.000000 $ 

note simple test case used cublas getrfbatched matrix lu factorization, rather lapack, think should behave lapack.

also note i'm not intending comment on "best approaches linear system solutions" merely explain how approach mapped out might made work.


Comments

Popular posts from this blog

yii2 - Yii 2 Running a Cron in the basic template -

asp.net - 'System.Web.HttpContext' does not contain a definition for 'GetOwinContext' Mystery -

mercurial graft feature, can it copy? -