CUDA — cublasDtrsm after LU factorization with pivoting
I am stuck on a small problem. I have to solve a linear system A * x = b.
The matrix A gets decomposed by an LU factorization (LAPACK). The result is the factorized matrix and the pivot array. After that I want to solve the two triangular systems L * y = b and then U * x = y on the GPU with *cublasDtrsm*.
Because of the row interchanges performed by dgetrf in LAPACK, I would have to pass the pivot array to cuBLAS — but the *cublasDtrsm* function does not offer this. Without applying the pivot array I get wrong results.
I looked into disabling pivoting in LAPACK, but regarding numerical stability that is not an option. Is there any hint on how to solve a linear equation system using this LU factorization?
If you want to use this particular approach (cuBLAS trsm after LAPACK getrf), I believe you should be able to use cuBLAS trsm with the L,U output of LAPACK by rearranging your b vector (or matrix) to match the rearrangement order that LAPACK performed during pivoting. I believe this order is given by the description of ipiv in the LAPACK documentation:
IPIV: INTEGER array, dimension (min(M,N)). The pivot indices; for 1 &lt;= i &lt;= min(M,N), row i of the matrix was interchanged with row IPIV(i).
Here is sample code that demonstrates the idea for a simple 3x3 test case with a single right-hand-side vector:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Abort with file/line context if a CUDA runtime call fails.
#define cudacall(call)                                                      \
    do                                                                      \
    {                                                                       \
        cudaError_t err = (call);                                           \
        if (cudaSuccess != err)                                             \
        {                                                                   \
            fprintf(stderr, "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err));           \
            cudaDeviceReset();                                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }                                                                       \
    while (0)

// Abort with file/line context if a cuBLAS call fails.
#define cublascall(call)                                                    \
    do                                                                      \
    {                                                                       \
        cublasStatus_t status = (call);                                     \
        if (CUBLAS_STATUS_SUCCESS != status)                                \
        {                                                                   \
            fprintf(stderr, "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", \
                    __FILE__, __LINE__, status);                            \
            cudaDeviceReset();                                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }                                                                       \
    while (0)

// In-place LU factorization (with partial pivoting) of the n x n matrix at
// device pointer src_d, using cublasSgetrfBatched with a batch of one.
// The 1-based pivot sequence is copied back to the host array `pivot`
// (length n). Exits the program if the matrix is reported singular.
void lu_device(float *src_d, int n, int *pivot)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;
    int *p, *info;
    cudacall(cudaMalloc<int>(&p, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&info, batchSize * sizeof(int)));

    int lda = n;
    // getrfBatched takes a device array of matrix pointers, even for batch 1.
    float *a[] = { src_d };
    float **a_d;
    cudacall(cudaMalloc<float*>(&a_d, sizeof(a)));
    cudacall(cudaMemcpy(a_d, a, sizeof(a), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, n, a_d, lda, p, info, batchSize));

    int infoh = 0;
    cudacall(cudaMemcpy(&infoh, info, sizeof(int), cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(pivot, p, n * batchSize * sizeof(int), cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
    for (int qq = 0; qq < n * batchSize; qq++)
    {
        printf("pivot[%d] = %d\n", qq, pivot[qq]);
    }
#endif
    if (infoh == n)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    cudaFree(p);
    cudaFree(info);
    cudaFree(a_d);
    cublasDestroy(handle);
}

// Factorize the host matrix `src` (n x n, column-major) on the device and
// split the packed LU result into separate host matrices `l` and `u`.
// L carries an implicit unit diagonal (its stored diagonal actually holds
// U's diagonal, which trsm ignores when called with CUBLAS_DIAG_UNIT).
void lu(float *src, float *l, float *u, int n, int *pivot)
{
    float *src_d;
    cudacall(cudaMalloc<float>(&src_d, n * n * sizeof(float)));
    cudacall(cudaMemcpy(src_d, src, n * n * sizeof(float), cudaMemcpyHostToDevice));

    lu_device(src_d, n, pivot);

    // Copy the packed LU factors back, then zero out the opposite triangles.
    cudacall(cudaMemcpy(l, src_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(u, src_d, n * n * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i++)
    {
        for (int j = 0;   j < i; j++) l[i*n+j] = 0.0f;
        for (int j = i+1; j < n; j++) u[i*n+j] = 0.0f;
    }
    cudaFree(src_d);
}

#define DIR_FORWARD 0
#define DIR_REVERSE 1
#define SWAP(x,y) { float swaptmp = (*(y)); (*(y)) = (*(x)); (*(x)) = swaptmp; }

// Apply (DIR_FORWARD) or undo (DIR_REVERSE) the row interchanges recorded
// in the 1-based `pivot` array to the vector `vec` of length n, i.e. the
// same permutation getrf applied to the matrix rows.
void rearrange(float *vec, int *pivot, int n, int dir)
{
    if (dir == DIR_FORWARD)
        for (int i = 0; i < n; i++)  SWAP((vec+i), (vec+pivot[i]-1))
    else
        for (int i = n-1; i >= 0; i--) SWAP((vec+i), (vec+pivot[i]-1))
}

// Solve the triangular system a * x = b (a: n x n column-major host matrix,
// b: length-n host RHS) with cublasStrsm and return the result in host
// vector x. `uplo` selects the lower/upper triangle, `diagt` whether the
// diagonal is implicit-unit.
void trsm(float *a, float *x, float *b, int n,
          cublasFillMode_t uplo, cublasDiagType_t diagt)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    float *a_d, *b_d;
    cudacall(cudaMalloc<float>(&a_d, n * n * sizeof(float)));
    cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
    cudacall(cudaMemcpy(b_d, b, n * sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMemcpy(a_d, a, n * n * sizeof(float), cudaMemcpyHostToDevice));

    const float alpha = 1.0f;
    // trsm solves in place: b_d is overwritten with the solution.
    cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt,
                           n, 1, &alpha, a_d, n, b_d, n));
    cudacall(cudaMemcpy(x, b_d, n * sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(a_d);
    cudaFree(b_d);
    cublasDestroy(handle);
}

// Solve A*x = b:
//   1. perform LU factorization on A
//   2. using the pivot sequence, rearrange b -> b'
//   3. perform trsm on L*y = b'
//   4. perform trsm on U*x = y
void test_solve()
{
    // A = |0  1  4 |
    //     |3  3  9 |
    //     |4 10 16 |
    // x = |1|
    //     |2|
    //     |3|
    // b = |14|
    //     |36|
    //     |72|
    const int n = 3;
    // column-major storage; this example has a 3,2,3 pivot order
    float a_col_major[n*n] = { 0, 3, 4,
                               1, 3, 10,
                               4, 9, 16 };
    float b1[n] = { 14, 36, 72 };
    /* another example - has a 3,3,3 pivot order
    float a_transpose[n*n] = { 0, 1, 4,
                               3, 3, 9,
                               4, 10, 16 };
    float b2[n] = { 18, 37, 70 };
    */
    float result_x[n];
    int   pivot[n];
    float l[n*n];
    float u[n*n];
    float y[n];

    // select the matrix/RHS under test here
    float *a = a_col_major;
    float *b = b1;

    printf("Input:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++) printf("%f\t", a[i*n+j]);
        printf("\n");
    }
    printf("\n\n");

    // 1. LU on a
    lu(a, l, u, n, pivot);
#ifdef DEBUG_PRINT
    printf("L:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++) printf("%f\t", l[i*n+j]);
        printf("\n");
    }
    printf("\n\n");
    printf("U:\n\n");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++) printf("%f\t", u[i*n+j]);
        printf("\n");
    }
    printf("\n\n");
#endif
    // 2. rearrange b to match the row interchanges getrf performed
    rearrange(b, pivot, n, DIR_FORWARD);
#ifdef DEBUG_PRINT
    for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
    // 3. trsm on L*y = b'  (L has an implicit unit diagonal)
    trsm(l, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
    // 4. trsm on U*x = y
    trsm(u, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);

    fprintf(stdout, "Solution:\n\n");
    for (int i = 0; i < n; i++)
    {
        printf("%f\n", result_x[i]);
    }
}

int main()
{
    test_solve();
    return 0;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:

0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000


Solution:

1.000000
2.000000
3.000000
$
Note that this simple test case used the cuBLAS getrfBatched function for the matrix LU factorization rather than LAPACK, but I think it should behave the same as LAPACK.
Also note that I am not intending to comment on "best approaches for linear system solutions", merely to explain how the approach you mapped out might be made to work.
Comments
Post a Comment